
# Data Cleaning

In [None]:
# Imports for GDrive
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

In [None]:
# Mounting the drive
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [None]:
#downloaded = drive.CreateFile({'id':'https://drive.google.com/file/d/17dnH3TDdLmxos83OTHEtXBCD5Tmu-L_k/view?usp=sharing'}) # replace the id with id of file you want to access
#downloaded.GetContentFile('h01-20201001-20201008.zip') 

In [None]:
from google.colab import drive

drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
import os
os.listdir('/content/gdrive/Shared drives/') 

['BPPC Acads']

In [2]:
# Import statements
import zipfile
import os

import pandas as pd
import numpy as np 
import csv
import re
import string
import time
from datetime import datetime

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

import warnings
warnings.filterwarnings("ignore")

In [3]:
# Extract zip file

def extract_zips(filename, temp_folder):
  zip_ref = zipfile.ZipFile(filename, 'r')
  zip_ref.extractall(temp_folder)
  zip_ref.close()

In [4]:
def create_filename(FILENAME, TEMP_FOLDER):
  FILES_NAMES = FILENAME.split(".")[0].split("/")[-1]
  FILES_NAMES = FILES_NAMES.rsplit("-", 1)[0]
  # FILES_NAMES = os.path.join(TEMP_FOLDER, FILES_NAMES)
  return FILES_NAMES

In [5]:
def get_date_string(FILESNAMES):
  date_string = FILESNAMES.split("-")[-1]
  return date_string

def strip_date(FILESNAMES):
  date_string = get_date_string(FILESNAMES)
  ob = datetime.strptime(date_string, "%Y%m%d")
  date_to_analyse = ob.strftime("%a %b %d")
  return date_to_analyse

In [6]:
# Constants
FILENAME = "/content/h01-20200818-10files.zip"
TEMP_FOLDER = "/tmp"
FILES_NAMES = create_filename(FILENAME, TEMP_FOLDER)
date_to_analyse = strip_date(FILES_NAMES)
# print(FILES_NAMES, date_to_analyse)

In [7]:
# Keep data in english only
def remove_other_langs(data):
  data = data[data['lang'] == 'en'].reset_index(drop=True)
  return data

In [8]:
# Keep specific date
def remove_other_dates(data, date_to_analyse):
  data = data[data['created_at'].str[:10] == date_to_analyse].reset_index(drop=True)
  return data

In [9]:
# Creating the RT Column
def create_rt_column(data):
  data['RT'] = data['text'].str[:2]=='RT'
  return data

In [10]:
# Parse CSV data
def parse_data_from_file(filename, date_to_analyse):
  data = pd.read_csv(filename, index_col = None, header=0, engine = 'python')
  data = remove_other_langs(data)
  data = remove_other_dates(data, date_to_analyse)
  data = create_rt_column(data)
  return data

In [11]:
# Parse all files of the same date
def parse_all_files(TEMP_FOLDER, FILES_NAMES, date_to_analyse):
  files = os.listdir(TEMP_FOLDER)
  new_data = []
  for file in files:
    if file.startswith(FILES_NAMES):
      parsed_data = parse_data_from_file(os.path.join(TEMP_FOLDER, file), date_to_analyse)
      new_data.append(parsed_data)
  return pd.concat(new_data, axis = 0)

In [12]:
def get_text_or_extended_text(data, added_ids):
  data["FINAL_TEXT"] = np.where(data['extended_tweet_full_text'].notnull(), data["extended_tweet_full_text"], data["text"])
  id = data["id"].to_dict()
  added_ids.update(id)
  return data["FINAL_TEXT"]

def get_quoted_text(data, added_ids):
  data["FINAL_TEXT"] = np.where(data['QT_full_text'].notnull(), data["QT_full_text"], data["QT_text"])
  added_ids.update(data["id"].to_dict())
  added_ids.update(data["QT_id"].to_dict())
  return data["FINAL_TEXT"]

def get_text_or_full_text_rt(data, added_ids):
  data["FINAL_TEXT"] = np.where(data['RT_full_text'].notnull(), data["RT_full_text"], data["RT_text"])
  added_ids.update(data["id"].to_dict())
  added_ids.update(data["RT_id"].to_dict())
  return data["FINAL_TEXT"]

def get_quote_rt_full(data, added_ids):
  data["FINAL_TEXT"] = get_text_or_full_text_rt(data, added_ids) + get_quoted_text(data, added_ids)
  return data["FINAL_TEXT"]



def get_quote_rt(data, is_quote, is_rt):
  data = data.loc[(data['is_quote_tweet'] == is_quote) & (data['RT'] == is_rt)]

  if not is_quote and not is_rt:
    added_ids.update(dict(zip(data["id"], data["is_quote_tweet"])))
    data["FINAL_TEXT"] = np.where(data['extended_tweet_full_text'].notnull(), data["extended_tweet_full_text"], data["text"])

  if is_quote and not is_rt:
    data["FINAL_TEXT"] = np.where(data["QT_id"].isin(added_ids.keys()), get_text_or_extended_text(data, added_ids), get_quoted_text(data, added_ids))

  if not is_quote and is_rt:
    data["FINAL_TEXT"] = np.where(data["RT_id"].isin(added_ids.keys()), added_ids.update(data['id'].to_dict()), get_text_or_full_text_rt(data, added_ids))
    data = data[data["FINAL_TEXT"].notna()]

  if is_quote and is_rt:
    data["FINAL_TEXT"] = np.where((data["RT_id"].isin(added_ids.keys())) & (data["QT_id"].isin(added_ids.keys())), added_ids.update(data['id'].to_dict()), None)
    data["FINAL_TEXT"] = np.where((data["RT_id"].isin(added_ids.keys())) & (~data["QT_id"].isin(added_ids.keys())), get_quoted_text(data, added_ids), None)
    data["FINAL_TEXT"] = np.where((~data["RT_id"].isin(added_ids.keys())) & (data["QT_id"].isin(added_ids.keys())), get_text_or_full_text_rt(data, added_ids), None)
    data["FINAL_TEXT"] = np.where((~data["RT_id"].isin(added_ids.keys())) & (~data["QT_id"].isin(added_ids.keys())), get_quote_rt_full(data, added_ids), None)
    data = data[data["FINAL_TEXT"].notna()]

  return data

In [13]:
# Driver code
added_ids = {}
extract_zips(FILENAME, TEMP_FOLDER)
data = parse_all_files(TEMP_FOLDER, FILES_NAMES, date_to_analyse)
print(data.shape, type(data))

(112410, 80) <class 'pandas.core.frame.DataFrame'>


In [19]:
# Check duplicates
# print(data["text"].nunique())
# print(data["extended_tweet_full_text"].nunique())
# print(data["QT_full_text"].nunique())
# print(data["QT_text"].nunique())
# print(data["RT_full_text"].nunique())
# print(data["RT_text"].nunique())

In [21]:
non_quote_non_rt = get_quote_rt(data, False, False)
print(non_quote_non_rt["FINAL_TEXT"], non_quote_non_rt.shape)

0        can we wear clone helmets instead of masks to ...
11       SC Senate to reconvene in Sept. to discuss COV...
13       @JohnAvlon During the COVID-19 crisis, Pres. T...
17       We just gonna act like @JamesStormBrand wasn’t...
18       The COVID-19 pandemic has created unprecedente...
                               ...                        
10874    #ReTurkey heading back to Hisaronu for 7th tim...
10876    @MollyJongFast SHE was wrong about the CV deat...
10880    "States with strict coronavirus lockdowns seem...
10882    @adamamin Does this mean that in mid-June of n...
10884    @ClayTravis @Outkick More evidence the Coronab...
Name: FINAL_TEXT, Length: 28689, dtype: object (28689, 81)


In [253]:
quote_non_rt = get_quote_rt(data, True, False)
print(quote_non_rt["FINAL_TEXT"], quote_non_rt.shape)

2        “Our forecast for the US economy ... reflects ...
3        Miami Dade’s antiquated storm drain system dum...
19       Our Department will be opening up many of its ...
31       I’m on a call with @SenBobCasey and @PAAttorne...
32       my bf never missed a apt❤️ except for today ca...
                               ...                        
10800    Police are preparing to launch their aerial ar...
10813    Mike Lindell, creator of MyPillow, is promotin...
10820    Rolling live coverage of the new restrictions ...
10847                               Wear\nA\nPhucking mask
10885    Like the Trump administration, Florida has emb...
Name: FINAL_TEXT, Length: 9171, dtype: object (9171, 81)


In [259]:
quote_rt = get_quote_rt(data, True, True)
print(quote_rt["FINAL_TEXT"], quote_rt.shape)

1        .@AndrewCuomo, thank you for your leadership a...
4        Gold Reef City  would make the money they’ve l...
7        North Carolina is now reporting they OVERCOUNT...
8        Data shows a low % of children in single paren...
9        Watch @iamCardiB and @JoeBiden talk police bru...
                               ...                        
10849    Ok, listen to this guy Carlos Piccata. \n"I'm ...
10850    Police are preparing to launch their aerial ar...
10871    UPDATE: Teachers in Arizona have staged a “sic...
10873    We've seen a high volume of music creators dis...
10878    'Anyone who's following COVID and its transmis...
Name: FINAL_TEXT, Length: 18227, dtype: object (18227, 81)


In [252]:
non_quote_rt = get_quote_rt(data, False, True)
print(non_quote_rt["FINAL_TEXT"], non_quote_rt.shape)

5        Me in Quarantine vs the story I'll tell my gra...
6        Trust us. This time the data is good.\n\n2020,...
10       I can’t believe China is partying and America ...
12       Me in Quarantine vs the story I'll tell my gra...
15       pregnancy and corona really neck and neck for ...
                               ...                        
10877    27 times Trump said the coronavirus would go a...
10881    It's starting to look as if Sweden got the bes...
10883    What is normal? Was everything we knew before ...
10886    OMG, college kids are going to test positive f...
10887    Anderson Cooper tears into the MyPillow guy fo...
Name: FINAL_TEXT, Length: 49887, dtype: object (49887, 81)
