In [11]:
# utilities
import re
import numpy as np
import pandas as pd
import string
import nltk

# nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk import word_tokenize, pos_tag, pos_tag_sents
from nltk.corpus import wordnet
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')

# data
from google.colab import drive
drive.mount('/content/drive')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [17]:
def pre_processing(tw):
  # LEMMATIZATION - reducing the derived words to their root form
  dataset=tw

  # make everything lower case
  dataset['text']=dataset['text'].str.lower()

  # remove stopwords
  dataset['text'] = dataset['text'].apply(lambda text: cleaning_stopwords(text))

  # remove punctuation
  dataset['text']= dataset['text'].apply(lambda x: cleaning_punctuations(x))

  # take out repearting character
  dataset['text'] = dataset['text'].apply(lambda x: cleaning_repeating_char(x))

  # take out numerical value
  dataset['text'] = dataset['text'].apply(lambda x: cleaning_numbers(x))

  tokenizer = RegexpTokenizer('\s+',gaps=True)
  dataset['text'] = dataset['text'].apply(tokenizer.tokenize)

  # stemming
  dataset['text']= dataset['text'].apply(lambda x: stemming_on_text(x))

  # tag part of speech for each word
  dataset['text']=pos_tag_sents(dataset['text'].tolist())

  # lemmatization
  dataset['text']=dataset['text'].apply(lambda x: prep_lemmatization(x)) # prep right part of speech before lemmatization step
  dataset['text']=dataset['text'].apply(lambda x: lemmatizer_on_text(x))


  return dataset
#===================================================================================================================================

stopwordlist = ['a', 'about', 'above', 'after', 'again', 'ain', 'all', 'am', 'an',
          'and','any','are', 'as', 'at', 'be', 'because', 'been', 'before',
          'being', 'below', 'between','both', 'by', 'can', 'd', 'did', 'do',
          'does', 'doing', 'down', 'during', 'each','few', 'for', 'from',
          'further', 'had', 'has', 'have', 'having', 'he', 'her', 'here',
          'hers', 'herself', 'him', 'himself', 'his', 'how', 'i', 'if', 'in',
          'into','is', 'it', 'its', 'itself', 'just', 'll', 'm', 'ma',
          'me', 'more', 'most','my', 'myself', 'now', 'o', 'of', 'on', 'once',
          'only', 'or', 'other', 'our', 'ours','ourselves', 'out', 'own', 're','s', 'same', 'she', "shes", 'should', "shouldve",'so', 'some', 'such',
          't', 'than', 'that', "thatll", 'the', 'their', 'theirs', 'them',
          'themselves', 'then', 'there', 'these', 'they', 'this', 'those',
          'through', 'to', 'too','under', 'until', 'up', 've', 'very', 'was',
          'we', 'were', 'what', 'when', 'where','which','while', 'who', 'whom',
          'why', 'will', 'with', 'won', 'y', 'you', "youd","youll", "youre",
          "youve", 'your', 'yours', 'yourself', 'yourselves','u','r',
          'im','ill','today','day','tomorrow','weekend','yesterday','go','te']

STOPWORDS = set(stopwordlist)
def cleaning_stopwords(text):
  return " ".join([word for word in str(text).split() if word not in STOPWORDS])

#===================================================================================================================================

english_punctuations = string.punctuation
punctuations_list = english_punctuations
def cleaning_punctuations(text):
    translator = str.maketrans('', '', punctuations_list)
    return text.translate(translator)

#===================================================================================================================================
# take out repeating characters 
def cleaning_repeating_char(text):
    return re.sub(r'(.)1+', r'1', text)

# take out numerical values
def cleaning_numbers(data):
    return re.sub('[0-9]+', '', data)
#===================================================================================================================================
st = nltk.PorterStemmer()
def stemming_on_text(data):
    text = [st.stem(word) for word in data]
    return text

def pos_tagger(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:         
        return None

def prep_lemmatization(data):
  text=list(map(lambda x: (x[0], pos_tagger(x[1])), data))
  return text

lm = nltk.WordNetLemmatizer()
def lemmatizer_on_text(data):
    text = [lm.lemmatize(word, tag) for word,tag in data if tag != None]
    return text

#===================================================================================================================================
def token_to_string(token_list):
  text=" ".join(token_list)
  return text

In [13]:
# load data
# train=pd.read_csv("/content/drive/MyDrive/WPI/CS 539 Final Project/Data Split/train.csv")
# test=pd.read_csv("/content/drive/MyDrive/WPI/CS 539 Final Project/Data Split/test.csv")
# valid=pd.read_csv("/content/drive/MyDrive/WPI/CS 539 Final Project/Data Split/validate.csv")
elon_tw=pd.read_csv("/content/drive/MyDrive/WPI/CS 539 Final Project/Elon Musk Tweets/Elon Musk Tweets.csv")

In [18]:
elon_tw1=pre_processing(elon_tw)
elon_tw1

Unnamed: 0,#,text,target
0,1,"[record, number, user, log, see, twitter, dead...",0
1,2,"[make, small, fortun, social, medium, start, l...",1
2,3,"[dont, wanna, jinx, chanc, keep, twitter, aliv]",1
3,4,"[best, peopl, stay, not, super, worri]",1
4,5,"[hit, alltim, high, twitter, usag, lol]",1
...,...,...,...
99,100,"[weve, make, question, tweet, want, clear, sup...",1
100,101,"[good, point, abl, select, version, twitter, w...",1
101,102,"[twitter, form, content, moder, council, wide,...",1
102,103,"[new, york, time, emerg, new, chaotic, actor, ...",0


In [None]:
train1=pre_processing(train)
train1

Unnamed: 0,id,date,flag,user,text,target
0,1982498874,Sun May 31 11:19:51 PDT 2009,NO_QUERY,Sun_Immaculate,"[miss, cut, grass, cut, grass, burn, smell, sm...",0.0
1,2052542788,Sat Jun 06 01:05:28 PDT 2009,NO_QUERY,cookiemonster82,"[chang, vote, wont, bat, today, co, rain, cric...",1.0
2,2072011839,Sun Jun 07 20:15:00 PDT 2009,NO_QUERY,davidallentv,"[ty, link, boost, even, tho, didnt, copi, whol...",1.0
3,2266222859,Sun Jun 21 08:07:29 PDT 2009,NO_QUERY,babyyg,"[whatt, feeel]",0.0
4,1973430757,Sat May 30 11:07:59 PDT 2009,NO_QUERY,aoitshirts,"[circl, triangl]",0.0
...,...,...,...,...,...,...
959995,1692515784,Sun May 03 20:09:50 PDT 2009,NO_QUERY,jgonzalez14,"[fleuri, im, get, old, im, not, hip, anymor, h...",0.0
959996,2056733151,Sat Jun 06 11:41:32 PDT 2009,NO_QUERY,cocovelvett,"[land, figur, ride, back, car]",0.0
959997,1976861893,Sat May 30 19:20:24 PDT 2009,NO_QUERY,AutismIsARose,"[get, off, work, return, apartmentturn, tv, sa...",1.0
959998,2169479730,Sun Jun 14 14:34:20 PDT 2009,NO_QUERY,sheepeatingtaz,"[far, much, wine, lose, chunk, day]",0.0


In [None]:
valid1=pre_processing(valid)
valid1

Unnamed: 0,id,date,flag,user,text,target
0,2247642316,Fri Jun 19 19:24:17 PDT 2009,NO_QUERY,jLyNeTtE,"[dontyouh, da, shit, wanna, say, long, charact]",0.0
1,2067384118,Sun Jun 07 12:14:56 PDT 2009,NO_QUERY,balbuenito,"[omg, ive, alway, want, see, wick, alreadi, do]",0.0
2,2055694237,Sat Jun 06 09:47:10 PDT 2009,NO_QUERY,kellynadams,"[finish, sit, fml]",0.0
3,1981407626,Sun May 31 09:05:18 PDT 2009,NO_QUERY,TwitjobsMedia,[thank],1.0
4,2061083242,Sat Jun 06 20:08:42 PDT 2009,NO_QUERY,KristenCampisi,"[know, way, get, hot, right]",1.0
...,...,...,...,...,...,...
319995,2051142502,Fri Jun 05 19:56:28 PDT 2009,NO_QUERY,Claire_BX,"[gonna, stay, tonight, lay, bed, watch, movi]",1.0
319996,2001497893,Tue Jun 02 01:04:52 PDT 2009,NO_QUERY,sallismoney,"[hit, suspect, ignor, lol]",1.0
319997,1974429008,Sat May 30 13:07:05 PDT 2009,NO_QUERY,websitejeff,"[case, first, round]",1.0
319998,2182403411,Mon Jun 15 13:07:24 PDT 2009,NO_QUERY,jazzieturtle,"[grandpa, hous, swim, lil, bit, idkyet, lol]",1.0


In [19]:
# convert list text back to string
# train1['text'] = train1['text'].apply(lambda x: token_to_string(x))
# test1['text'] = test1['text'].apply(lambda x: token_to_string(x))
# valid1['text'] = valid1['text'].apply(lambda x: token_to_string(x))
elon_tw1['text'] = elon_tw1['text'].apply(lambda x: token_to_string(x))

In [21]:
#Save preprocessing data
path = "/content/drive/MyDrive/WPI/CS 539 Final Project/Elon Musk Tweets/Elon Musk Tweets preprocessing.csv"
with open(path, 'w') as f:
  elon_tw1.to_csv(f, index=False)
# path = "/content/drive/MyDrive/WPI/CS 539 Final Project/Data Split/train_preprocessing.csv"
# with open(path, 'w') as f:
#   train1.to_csv(f, index=False)
# path = "/content/drive/MyDrive/WPI/CS 539 Final Project/Data Split/valid_preprocessing.csv"
# with open(path, 'w') as f:
#   valid1.to_csv(f, index=False)