
# Data Cleaning

In [None]:
# Imports for GDrive
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

In [None]:
# Mounting the drive
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [None]:
#downloaded = drive.CreateFile({'id':'https://drive.google.com/file/d/17dnH3TDdLmxos83OTHEtXBCD5Tmu-L_k/view?usp=sharing'}) # replace the id with id of file you want to access
#downloaded.GetContentFile('h01-20201001-20201008.zip') 

In [None]:
from google.colab import drive

drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
import os
os.listdir('/content/gdrive/Shared drives/') 

['BPPC Acads']

In [48]:
!pip install pandas
!pip install numpy
!pip install nltk

# Import statements
import zipfile
import os

import pandas as pd
import numpy as np 
import csv
import re
import string
import time
from datetime import datetime

!pip install multiprocess
import multiprocess
from multiprocess import Pool, Process

!pip install emoji
import emoji

import nltk
from nltk import pos_tag, pos_tag_sents
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
# from sklearn.feature_extraction.text import TfidfVectorizer

import warnings
warnings.filterwarnings("ignore")

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
stop_words = set(stopwords.words('english')) - set(['at', 'do', 'your', 'from', 'to', 'out', 'no', 'the'])



[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/priyanshkedia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/priyanshkedia/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/priyanshkedia/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/priyanshkedia/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [5]:
# Extract zip file

def extract_zips(filename, temp_folder):
  zip_ref = zipfile.ZipFile(filename, 'r')
  zip_ref.extractall(temp_folder)
  zip_ref.close()

In [6]:
def create_filename(FILENAME, TEMP_FOLDER):
  FILES_NAMES = FILENAME.split(".")[0].split("/")[-1]
  FILES_NAMES = FILES_NAMES.rsplit("-", 1)[0]
  # FILES_NAMES = os.path.join(TEMP_FOLDER, FILES_NAMES)
  return FILES_NAMES

In [7]:
def get_date_string(FILESNAMES):
  date_string = FILESNAMES.split("-")[-1]
  return date_string

def strip_date(FILESNAMES):
  date_string = get_date_string(FILESNAMES)
  ob = datetime.strptime(date_string, "%Y%m%d")
  date_to_analyse = ob.strftime("%a %b %d")
  return date_to_analyse

In [8]:
# Constants
FILENAME = "h01-20200818-10files.zip"
TEMP_FOLDER = "/tmp"
FILES_NAMES = create_filename(FILENAME, TEMP_FOLDER)
date_to_analyse = strip_date(FILES_NAMES)
# print(FILES_NAMES, date_to_analyse)

In [9]:
# Keep data in english only
def remove_other_langs(data):
  data = data[data['lang'] == 'en'].reset_index(drop=True)
  return data

In [10]:
# Keep specific date
def remove_other_dates(data, date_to_analyse):
  data = data[data['created_at'].str[:10] == date_to_analyse].reset_index(drop=True)
  return data

In [11]:
# Creating the RT Column
def create_rt_column(data):
  data['RT'] = data['text'].str[:2]=='RT'
  return data

In [12]:
# Parse CSV data
def parse_data_from_file(filename, date_to_analyse):
  data = pd.read_csv(filename, index_col = None, header=0, engine = 'python')
  data = remove_other_langs(data)
  data = remove_other_dates(data, date_to_analyse)
  data = create_rt_column(data)
  return data

In [13]:
# Parse all files of the same date
def parse_all_files(TEMP_FOLDER, FILES_NAMES, date_to_analyse):
  files = os.listdir(TEMP_FOLDER)
  new_data = []
  for file in files:
    if file.startswith(FILES_NAMES):
      parsed_data = parse_data_from_file(os.path.join(TEMP_FOLDER, file), date_to_analyse)
      new_data.append(parsed_data)
  return pd.concat(new_data, axis = 0)

In [14]:
def get_text_or_extended_text(data, added_ids):
  data["FINAL_TEXT"] = np.where(data['extended_tweet_full_text'].notnull(), data["extended_tweet_full_text"], data["text"])
  id = data["id"].to_dict()
  added_ids.update(id)
  return data["FINAL_TEXT"]

def get_quoted_text(data, added_ids):
  data["FINAL_TEXT"] = np.where(data['QT_full_text'].notnull(), data["QT_full_text"], data["QT_text"])
  added_ids.update(data["id"].to_dict())
  added_ids.update(data["QT_id"].to_dict())
  return data["FINAL_TEXT"]

def get_text_or_full_text_rt(data, added_ids):
  data["FINAL_TEXT"] = np.where(data['RT_full_text'].notnull(), data["RT_full_text"], data["RT_text"])
  added_ids.update(data["id"].to_dict())
  added_ids.update(data["RT_id"].to_dict())
  return data["FINAL_TEXT"]

def get_quote_rt_full(data, added_ids):
  data["FINAL_TEXT"] = get_text_or_full_text_rt(data, added_ids) + get_quoted_text(data, added_ids)
  return data["FINAL_TEXT"]



def get_quote_rt(data, is_quote, is_rt):
  data = data.loc[(data['is_quote_tweet'] == is_quote) & (data['RT'] == is_rt)]

  if not is_quote and not is_rt:
    added_ids.update(dict(zip(data["id"], data["is_quote_tweet"])))
    data["FINAL_TEXT"] = np.where(data['extended_tweet_full_text'].notnull(), data["extended_tweet_full_text"], data["text"])

  if is_quote and not is_rt:
    data["FINAL_TEXT"] = np.where(data["QT_id"].isin(added_ids.keys()), get_text_or_extended_text(data, added_ids), get_quoted_text(data, added_ids))

  if not is_quote and is_rt:
    data["FINAL_TEXT"] = np.where(data["RT_id"].isin(added_ids.keys()), added_ids.update(data['id'].to_dict()), get_text_or_full_text_rt(data, added_ids))
    data = data[data["FINAL_TEXT"].notna()]

  if is_quote and is_rt:
    data["FINAL_TEXT"] = np.where((data["RT_id"].isin(added_ids.keys())) & (data["QT_id"].isin(added_ids.keys())), added_ids.update(data['id'].to_dict()), None)
    data["FINAL_TEXT"] = np.where((data["RT_id"].isin(added_ids.keys())) & (~data["QT_id"].isin(added_ids.keys())), get_quoted_text(data, added_ids), None)
    data["FINAL_TEXT"] = np.where((~data["RT_id"].isin(added_ids.keys())) & (data["QT_id"].isin(added_ids.keys())), get_text_or_full_text_rt(data, added_ids), None)
    data["FINAL_TEXT"] = np.where((~data["RT_id"].isin(added_ids.keys())) & (~data["QT_id"].isin(added_ids.keys())), get_quote_rt_full(data, added_ids), None)
    data = data[data["FINAL_TEXT"].notna()]

  return data

In [15]:
# Driver code
added_ids = {}
extract_zips(FILENAME, TEMP_FOLDER)
data = parse_all_files(TEMP_FOLDER, FILES_NAMES, date_to_analyse)
print(data.shape, type(data))

(112410, 80) <class 'pandas.core.frame.DataFrame'>


In [16]:
# Check duplicates
# print(data["text"].nunique())
# print(data["extended_tweet_full_text"].nunique())
# print(data["QT_full_text"].nunique())
# print(data["QT_text"].nunique())
# print(data["RT_full_text"].nunique())
# print(data["RT_text"].nunique())

In [17]:
non_quote_non_rt = get_quote_rt(data, False, False)
print(non_quote_non_rt["FINAL_TEXT"], non_quote_non_rt.shape)

1        @CarolineHutt @GodfreyOSWALD @littlemissj74 @S...
4        This new platform is designed to be used by st...
5        @Georgia_Levan @emmakennytv Sorry, correct. I ...
6        @briduimhaoluala @redwenzen @DeeGilhawley @INT...
9        @DaveHopper19 @baestheorum @randyhillier I've ...
                               ...                        
11135    FingerprintJS is looking for a remote Full-sta...
11147    If the latest pandemic restrictions are causin...
11148    Nice work on Multisystem Inflammatory Syndrome...
11149    @pdobi Some people are pushing a plant extract...
11150    Lunch time! Having lunch with some of our 'co-...
Name: FINAL_TEXT, Length: 28689, dtype: object (28689, 81)


In [23]:
quote_non_rt = get_quote_rt(data, True, False)
print(quote_non_rt["FINAL_TEXT"], quote_non_rt.shape)

2        $RLFTF $RLF\nDHR Health Institute for Research...
8        pregnancy and corona really neck and neck for ...
14              This COVID shit lasting like a Honda Civic
25       Covid-19's long-term effects, especially in yo...
28       San Francisco, California: \n\nCOVID-19 denier...
                               ...                        
11180    UPDATE: COVID-19 cases in Ghana\n\nConfirmed -...
11195    Never thought I would see the day I was sharin...
11196    కావాల్సిన మందులు, పరికరాలు, ఇంజక్షన్లు, వెంటిల...
11202    @realDonaldTrump The Trump Virus is making Ame...
11228    Wuhan wave.\n\nPeople watch a performance as t...
Name: FINAL_TEXT, Length: 9171, dtype: object (9171, 81)


In [24]:
quote_rt = get_quote_rt(data, True, True)
print(quote_rt["FINAL_TEXT"], quote_rt.shape)

0        Thousands attend pool party in Wuhan, China, t...
10       Gov. Cuomo writing book about leading NY throu...
12       BREAKING: 71 of Mississippi's 82 counties are ...
15       .@NYGovCuomo's leadership was an American Cris...
16       'Cashback of Rs 50,000 if you catch COVID-19 w...
                               ...                        
11189    Lakhs of students asking you for NOT taking Ex...
11199    Join us Tuesday for an important back-to-schoo...
11200    Due to the laughable "recount" of deaths in th...
11220    As an ER doc, I've seen countless people die f...
11223       https://t.co/K25QaQYTtzhttps://t.co/K25QaQYTtz
Name: FINAL_TEXT, Length: 17873, dtype: object (17873, 81)


In [25]:
non_quote_rt = get_quote_rt(data, False, True)
print(non_quote_rt["FINAL_TEXT"], non_quote_rt.shape)

3                 Corona Bouncer 🙄 https://t.co/W26X5A6yQa
5        Taiwan's 1st coronavirus vaccine approved for ...
7        I'll be happy when Kenyans can get the vaccine...
9        Jyotsna Mohanty of BBSR GGP colony.Corona Cris...
11              This COVID shit lasting like a Honda Civic
                               ...                        
11218    Florida reported 26,164 COVID-19 tests today w...
11219    Happy that netball is back next month, but now...
11221    The best "own" that Trump has is that Michelle...
11226    Lessons from covid:\n\nLife is an absolute gif...
11227    it’s a good thing tattoo parlors are closed du...
Name: FINAL_TEXT, Length: 49887, dtype: object (49887, 81)


1. Casing (Upper or lower case)
2. Noise Removal (Removal of punctuation, white spaces, special characters, HTML tags)
3. Tokenization (Tweets to tokens i.e. words seprated by spaces)
4. Stopword Removal
5. Text Normalization (Stemming and Lemmatization) bold text

In [38]:
#Removing emojis
def demoji(text):
  emoji_pattern = re.compile("["
  u"\U0001F600-\U0001F64F"  # emoticons
  u"\U0001F300-\U0001F5FF"  # symbols & pictographs
  u"\U0001F680-\U0001F6FF"  # transport & map symbols
  u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
  u"\U00002702-\U000027B0"
  u"\U000024C2-\U0001F251"
  u"\U00010000-\U0010ffff"
                        "]+", flags=re.UNICODE)
  return(emoji_pattern.sub(r'', text))

#POSTags
def get_wordnet_pos(word):
  """Map POS tag to first character lemmatize() accepts"""
  tag = nltk.pos_tag([word])[0][1][0].upper()
  tag_dict = {"J": wordnet.ADJ, "N": wordnet.NOUN, "V": wordnet.VERB, "R": wordnet.ADV}
  return tag_dict.get(tag, wordnet.NOUN)

#Remove URLs, user@, punctutions
def df_cleaning(data_):
  data_["FINAL_TEXT"] = data_["FINAL_TEXT"].astype(str).str.replace(r"http\S+| www\S+| https\S+| \S+\.com\S+| \S+\.com| \@[\w]+", "", regex=True)

  # ##################-------Punctutions-------##################
  data_["FINAL_TEXT"] = data_["FINAL_TEXT"].astype(str).str.translate(str.maketrans("", "", string.punctuation))
 
  # ##################-------More Cleaning-------##################
  data_["FINAL_TEXT"] = data_["FINAL_TEXT"].astype(str).str.replace("/[^a-zA-Z0-9 ]/g", "", regex=True).str.replace("\n"," ", regex=True).str.replace("—"," ", regex=True).str.strip("“").str.strip("”").str.strip("’").str.lstrip(" ").str.rstrip(" ")
  
  # ##################-------Emojis-------##################
  data_["FINAL_TEXT"] = data_["FINAL_TEXT"].astype(str).apply(lambda x:demoji(x))
  
  # ##################-------Tokenizing-------##################
  data_["FINAL_TEXT"] = data_["FINAL_TEXT"].astype(str)
  # data["FINAL_TEXT"] = data["FINAL_TEXT"].apply(nltk.word_tokenize)
  data_["FINAL_TEXT"] = data_.apply(lambda row: nltk.word_tokenize(row.FINAL_TEXT), axis=1)
  
  # ##################-------Removing stopwords-------##################
  data_["FINAL_TEXT"] = data_["FINAL_TEXT"].astype(str).apply(lambda x: [word for word in x if word not in stop_words])
  
  # ##################-------Stemming-------##################
  ps = PorterStemmer()
  data_["FINAL_TEXT"] = data_["FINAL_TEXT"].astype(str).apply(lambda x: [ps.stem(y) for y in x])
 
  # ##################-------Lemmatizing-------##################
#   lemmatizer = WordNetLemmatizer()
#   data["FINAL_TEXT"] = data["FINAL_TEXT"].astype(str).apply(lambda x: [lemmatizer.lemmatize(y, get_wordnet_pos(y)) for y in x])
  
  # ##################-------Joining the lemmetized tokens to form string-------##################
#   data["FINAL_TEXT"] = data["FINAL_TEXT"].astype(str).apply(lambda x: " ".join([word for word in x]))
 
  # ##################-------Remove punctuations-------##################
  data_["FINAL_TEXT"] = data_["FINAL_TEXT"].astype(str).str.translate(str.maketrans("", "", string.punctuation)).str.replace("’", " ").str.replace("“", " ").str.replace("”", " ")
 

  return data_


In [60]:
a = non_quote_non_rt.copy()
b = non_quote_non_rt.copy()

In [61]:
# df_cleaning(non_quote_non_rt, quote_non_rt, quote_rt, non_quote_rt)
start = time.time()
resul = df_cleaning(a)
end = time.time()
print("time = ", end - start)

time =  18.805187940597534


In [62]:
def split_dataframe(df, nums = 4): 
    chunks = list()
    num_chunks = nums
    chunk_size = len(df) // nums
    for i in range(num_chunks):
        chunks.append(df[i*chunk_size:(i+1)*chunk_size])
    return chunks

In [63]:
print(multiprocess.cpu_count())
df = split_dataframe(b, 8)

# for i in range(4): 
#   df.iloc[i * 25: i* 25+25, 0] = data[i]
  # p.close()
  # p.join()

8


In [68]:
start = time.time()
with Pool(8) as p:
  data = p.map(df_cleaning, df)
  p.close()
  p.join()
    
end = time.time()
print("time = ", end - start)

time =  6.200814247131348


In [69]:

processes = []
start = time.time()

for i in range(8):
  p = Process(target = df_cleaning, args=(df[i],))
  processes.append(p)
  p.start()

for p in processes: p.join()
    
end = time.time()
print("time = ", end - start)

time =  4.1698899269104
