In [144]:
# Import relevant packages 

import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.feature_extraction.text import CountVectorizer
import re
import string
import statsmodels.formula.api as smf

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer # Porter stemmer is used in the slides example 
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize, pos_tag
from nltk.corpus import wordnet
from collections import defaultdict

# for date time 
import datetime as dt


# Advanced Social Data Science 2 (ASDS2) Exercises


## April 21: Preprocessing

### 1: Importing data without preprocessing

1. Download the data set available here, which contains the nearly 6,000 times Donald Trump insulted someone on Twitter: https://www.kaggle.com/ayushggarg/all-trumps-twitter-insults-20152021 
2. Load the csv as a data frame using pandas.
3. The variable ‘target’ has an indicator for the target of the insult. The data reveals that Trump’s most frequent insult target is ‘the media’ (‘the-media’ in the data). Create a binary indicator for whether Trump targets the media. Fit a linear regression with this binary indicator as the dependent variable and the date of the tweet as the independent variable. Does Trump become more or less likely to insult the media over time? Why might this be? 
4. Using the CountVectorizer from sklearn, convert the tweets to a document-feature matrix. What are the dimensions of the matrix?


In [145]:
# 2 - load csv file with pandas 

df=pd.read_csv('trump_insult_tweets_2014_to_2021.csv',index_col = 0)
#df.drop(['Unnamed: 0'], axis='columns', inplace=True)
df

Unnamed: 0,date,target,insult,tweet
1,2014-10-09,thomas-frieden,fool,"Can you believe this fool, Dr. Thomas Frieden ..."
2,2014-10-09,thomas-frieden,DOPE,"Can you believe this fool, Dr. Thomas Frieden ..."
3,2015-06-16,politicians,all talk and no action,Big time in U.S. today - MAKE AMERICA GREAT AG...
4,2015-06-24,ben-cardin,It's politicians like Cardin that have destroy...,Politician @SenatorCardin didn't like that I s...
5,2015-06-24,neil-young,total hypocrite,"For the nonbeliever, here is a photo of @Neily..."
...,...,...,...,...
10356,2021-01-06,2020-election,Many States want to decertify the mistake they...,If Vice President @Mike_Pence comes through fo...
10357,2021-01-06,2020-election,"based on irregularities and fraud, plus corrup...","States want to correct their votes, which they..."
10358,2021-01-06,2020-election,Our Election Process is worse than that of thi...,"They just happened to find 50,000 ballots late..."
10359,2021-01-06,2020-election,a FRAUD,The States want to redo their votes. They foun...


In [146]:
### 3a) Create a binary indicator for whether Trump targets the media 

# Creating the new indicator varible 
df['media_target'] = [1 if obs == 'the-media' else 0 for obs in df.target]

# df['media_target'] = df['target'].apply(lambda x: 1 if x == 'the-media' else 0)

# counting the instaces : 1287 times the rants were tageget towards the media 
print(df['media_target'].value_counts())
print()
print(df['target'].value_counts())


0    9073
1    1287
Name: media_target, dtype: int64

the-media                       1287
democrats                        647
hillary-clinton                  625
trump-russia                     441
joe-biden                        402
                                ... 
amy-klobuchar                      1
civil-war-memorial-opponents       1
kasie-hunt                         1
first-100-days                     1
abc-politics                       1
Name: target, Length: 866, dtype: int64


In [147]:
# 3b)  Fit a linear regression with this binary indicator as the dependent variable 
#      and the date of the tweet as the independent variable. 
#      Does Trump become more or less likely to insult the media over time? Why might this be? MORE


# make the date column into date/time and then ordinal (for it to work with linerarregression())
df['date'] = pd.to_datetime(df['date'])
df['Date'] = df['date'].map(dt.datetime.toordinal)


# df['date'] = pd.to_datetime(df['date'], format = '%Y-%m-%d', errors = 'ignore')     : from solution 
 

#Create the feature and target vectors
X = df['date'].values.reshape(-1, 1)
y = df['media_target'].values

# fitting the linear regression, reshaping values to make it run 
model=LinearRegression().fit(X,y)

# inspect resutls 
print('coef :', model.coef_[0])
print('intercept : ',model.intercept_)





coef : 4.9882883785881e-19
intercept :  -0.6419861722097603


In [148]:
# 4) Using the CountVectorizer from sklearn, convert the tweets to a document-feature matrix.
#    What are the dimensions of the matrix?

cv = CountVectorizer(ngram_range=(1,1))
matrix = cv.fit_transform(df["tweet"])
matrix.shape

print('Matrix dimensions: 10360 x 10057')

matrix.shape

Matrix dimensions: 10360 x 10057


(10360, 10057)

In [149]:
#### FROM SOLUTION 


#Creating and fitting a vectorizer to convert the tweets to a document-feature matrix
vectorizer = CountVectorizer(lowercase=False, ngram_range=(1,1), analyzer = "word")

matrix = vectorizer.fit_transform(df['tweet'])

#Viewing the matrix dimensions
matrix.shape


(10360, 12902)

### 2: Preprocessing steps

1. Remove all tagged users, i.e. words starting with the ‘@’ character.
2. Lowercase all tweet text.
3. Remove numbers.
4. Remove punctuation. 
5. Remove extra whitespaces.
6. Remove default stopwords.
7. Stem words.
8. Lemmatize words.


In [150]:
''' ### cleaning the tweets 

# make a string with all the tweets 
tweets = df['tweet'].dropna()

def cleaner(text):
    
    # 1)  remove all tagged users  
    text = re.sub('\s?@[^\s]+','',text) # whiteapse before &?  ( old:  \s([@][\w_-]+) ) 
    
    # 2) lowercase text 
    text = text.lower()
    
    # 3) numbers 
    text = re.sub('[0-9]','',text)
    
    # 4) remove puntuation 
    text = text.translate(str.maketrans('', '', string.punctuation)) 
    
    # 5) whitespaces 
    text = re.sub('\s\s+', ' ', text) # flags=re.I ? 
    
    # Extra: URLs
    text = re.sub(r"http\S+", "",text)

    # Extra: Replacing Single Characters with Empty String
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)
    
############################### Tokenization #########################
    
    # tokenize words
    text = word_tokenize(text)
    
    # 6) stopwords 
    text = [word for word in text if not word in stopwords.words()]

    
    return text

# Text Preprocessing
cleanTweets = tweets.apply(lambda x : cleaner(x))


# comparing o random tweet 
print('Raw tweet:', tweets[4])
print('Cleaned tweet:', cleanTweets[4]) '''

' ### cleaning the tweets \n\n# make a string with all the tweets \ntweets = df[\'tweet\'].dropna()\n\ndef cleaner(text):\n    \n    # 1)  remove all tagged users  \n    text = re.sub(\'\\s?@[^\\s]+\',\'\',text) # whiteapse before &?  ( old:  \\s([@][\\w_-]+) ) \n    \n    # 2) lowercase text \n    text = text.lower()\n    \n    # 3) numbers \n    text = re.sub(\'[0-9]\',\'\',text)\n    \n    # 4) remove puntuation \n    text = text.translate(str.maketrans(\'\', \'\', string.punctuation)) \n    \n    # 5) whitespaces \n    text = re.sub(\'\\s\\s+\', \' \', text) # flags=re.I ? \n    \n    # Extra: URLs\n    text = re.sub(r"http\\S+", "",text)\n\n    # Extra: Replacing Single Characters with Empty String\n    text = re.sub(r\'\\s+[a-zA-Z]\\s+\', \' \', text)\n    \n############################### Tokenization #########################\n    \n    # tokenize words\n    text = word_tokenize(text)\n    \n    # 6) stopwords \n    text = [word for word in text if not word in stopwords.words()

In [151]:
### from solution 

#Remove all tagged users, i.e. words starting with the ‘@’ character.
df['tweet_no_tags'] = df['tweet'].apply(lambda x: re.sub(r'@\w+ ', '', x)) 
    #Notice that there may be an issue here: If Trump mistakenly put a space after @ (e.g. '@ CBS' in index 576) the tag is not removed

#Lowercase all tweet text
df['tweet_lowercase'] = df['tweet_no_tags'].apply(lambda x: x.lower())

#Remove numbers 
df['tweet_no_numbers'] = df['tweet_lowercase'].apply(lambda x: re.sub(r'[0-9]','', x))

#Remove punctuation 
df['tweet_no_punc'] = df['tweet_no_numbers'].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))

#Remove whitespace
df['tweet_no_whitespace'] = df['tweet_no_punc'].apply(lambda x: x.strip())
df['tweet_no_whitespace'] = df['tweet_no_whitespace'].apply(lambda x: re.sub(' +',' ',x))


df

Unnamed: 0,date,target,insult,tweet,media_target,Date,tweet_no_tags,tweet_lowercase,tweet_no_numbers,tweet_no_punc,tweet_no_whitespace
1,2014-10-09,thomas-frieden,fool,"Can you believe this fool, Dr. Thomas Frieden ...",0,735515,"Can you believe this fool, Dr. Thomas Frieden ...","can you believe this fool, dr. thomas frieden ...","can you believe this fool, dr. thomas frieden ...",can you believe this fool dr thomas frieden of...,can you believe this fool dr thomas frieden of...
2,2014-10-09,thomas-frieden,DOPE,"Can you believe this fool, Dr. Thomas Frieden ...",0,735515,"Can you believe this fool, Dr. Thomas Frieden ...","can you believe this fool, dr. thomas frieden ...","can you believe this fool, dr. thomas frieden ...",can you believe this fool dr thomas frieden of...,can you believe this fool dr thomas frieden of...
3,2015-06-16,politicians,all talk and no action,Big time in U.S. today - MAKE AMERICA GREAT AG...,0,735765,Big time in U.S. today - MAKE AMERICA GREAT AG...,big time in u.s. today - make america great ag...,big time in u.s. today - make america great ag...,big time in us today make america great again...,big time in us today make america great again ...
4,2015-06-24,ben-cardin,It's politicians like Cardin that have destroy...,Politician @SenatorCardin didn't like that I s...,0,735773,Politician didn't like that I said Baltimore n...,politician didn't like that i said baltimore n...,politician didn't like that i said baltimore n...,politician didnt like that i said baltimore ne...,politician didnt like that i said baltimore ne...
5,2015-06-24,neil-young,total hypocrite,"For the nonbeliever, here is a photo of @Neily...",0,735773,"For the nonbeliever, here is a photo of in my ...","for the nonbeliever, here is a photo of in my ...","for the nonbeliever, here is a photo of in my ...",for the nonbeliever here is a photo of in my o...,for the nonbeliever here is a photo of in my o...
...,...,...,...,...,...,...,...,...,...,...,...
10356,2021-01-06,2020-election,Many States want to decertify the mistake they...,If Vice President @Mike_Pence comes through fo...,0,737796,"If Vice President comes through for us, we wil...","if vice president comes through for us, we wil...","if vice president comes through for us, we wil...",if vice president comes through for us we will...,if vice president comes through for us we will...
10357,2021-01-06,2020-election,"based on irregularities and fraud, plus corrup...","States want to correct their votes, which they...",0,737796,"States want to correct their votes, which they...","states want to correct their votes, which they...","states want to correct their votes, which they...",states want to correct their votes which they ...,states want to correct their votes which they ...
10358,2021-01-06,2020-election,Our Election Process is worse than that of thi...,"They just happened to find 50,000 ballots late...",0,737796,"They just happened to find 50,000 ballots late...","they just happened to find 50,000 ballots late...","they just happened to find , ballots late last...",they just happened to find ballots late last ...,they just happened to find ballots late last n...
10359,2021-01-06,2020-election,a FRAUD,The States want to redo their votes. They foun...,0,737796,The States want to redo their votes. They foun...,the states want to redo their votes. they foun...,the states want to redo their votes. they foun...,the states want to redo their votes they found...,the states want to redo their votes they found...


In [158]:
### 6) Remove default stopwords

def remove_stopwords(sent):
    
    patterns = set(stopwords.words('english'))

    for pattern in patterns:
        if re.search(' '+pattern+' ', sent):           #Searching for stopwords bounded by whitespace in each tweet
            sent = re.sub(' '+pattern+' ', ' ', sent)  #Substituting stopwords with whitespace

    return sent

df['tweet_no_stopwords'] = df['tweet_no_whitespace'].apply(lambda x: remove_stopwords(x))


### 7) Stem words

def stemmer(sent):
    stemmer = PorterStemmer()         #Creating stemmer
    
    sent = word_tokenize(sent)        #Tokenizing, as stemmer only takes tokenized sentences
    sent_stemmed = []                 #Empty list to save stemmed sentence
    
    for word in sent:
        stem = stemmer.stem(word) #Stemming words
        sent_stemmed.append(stem)

    return ' '.join(sent_stemmed)

df['tweet_stemmed'] = df['tweet_no_stopwords'].apply(lambda x: stemmer(x))



### 8) Lemmatize words

def lemmatize(sent):
    
    #First, the nltk wordnet lemmatizer needs the part-of-speech (POS) tag to correctly lemmatize
    #NLTK has a POS-tagger, but the format does not match POS-tags in wordnet's lemmatizer. 
    #The mapping dictionary below fixes that.
    
    tag_map = defaultdict(lambda : wordnet.NOUN)  #If nothing else is specified, use noun tag
    tag_map['J'] = wordnet.ADJ
    tag_map['V'] = wordnet.VERB
    tag_map['R'] = wordnet.ADV    
    
    lemmatizer = WordNetLemmatizer()        #Creating lemmatizer.
    
    sent = word_tokenize(sent)              #Tokenizing, as lemmatizer only takes tokenized sentences
    sent_lemmatized = []                    #Empty list to save lemmatized sentence

    for word, tag in pos_tag(sent):
        lemma = lemmatizer.lemmatize(word, tag_map[tag[0]])  #Where the magic happens
        sent_lemmatized.append(lemma)
    
    return ' '.join(sent_lemmatized)

df['tweet_lemmatized'] = df['tweet_no_stopwords'].apply(lambda x: lemmatize(x))


df

Unnamed: 0,date,target,insult,tweet,media_target,Date,tweet_no_tags,tweet_lowercase,tweet_no_numbers,tweet_no_punc,tweet_no_whitespace,tweet_no_stopwords,tweet_stemmed,tweet_lemmatized
1,2014-10-09,thomas-frieden,fool,"Can you believe this fool, Dr. Thomas Frieden ...",0,735515,"Can you believe this fool, Dr. Thomas Frieden ...","can you believe this fool, dr. thomas frieden ...","can you believe this fool, dr. thomas frieden ...",can you believe this fool dr thomas frieden of...,can you believe this fool dr thomas frieden of...,can believe fool dr thomas frieden cdc stated ...,can believ fool dr thoma frieden cdc state any...,can believe fool dr thomas frieden cdc state a...
2,2014-10-09,thomas-frieden,DOPE,"Can you believe this fool, Dr. Thomas Frieden ...",0,735515,"Can you believe this fool, Dr. Thomas Frieden ...","can you believe this fool, dr. thomas frieden ...","can you believe this fool, dr. thomas frieden ...",can you believe this fool dr thomas frieden of...,can you believe this fool dr thomas frieden of...,can believe fool dr thomas frieden cdc stated ...,can believ fool dr thoma frieden cdc state any...,can believe fool dr thomas frieden cdc state a...
3,2015-06-16,politicians,all talk and no action,Big time in U.S. today - MAKE AMERICA GREAT AG...,0,735765,Big time in U.S. today - MAKE AMERICA GREAT AG...,big time in u.s. today - make america great ag...,big time in u.s. today - make america great ag...,big time in us today make america great again...,big time in us today make america great again ...,big time us today make america great politicia...,big time us today make america great politicia...,big time u today make america great politician...
4,2015-06-24,ben-cardin,It's politicians like Cardin that have destroy...,Politician @SenatorCardin didn't like that I s...,0,735773,Politician didn't like that I said Baltimore n...,politician didn't like that i said baltimore n...,politician didn't like that i said baltimore n...,politician didnt like that i said baltimore ne...,politician didnt like that i said baltimore ne...,politician didnt like said baltimore needs job...,politician didnt like said baltimor need job s...,politician didnt like say baltimore need job s...
5,2015-06-24,neil-young,total hypocrite,"For the nonbeliever, here is a photo of @Neily...",0,735773,"For the nonbeliever, here is a photo of in my ...","for the nonbeliever, here is a photo of in my ...","for the nonbeliever, here is a photo of in my ...",for the nonbeliever here is a photo of in my o...,for the nonbeliever here is a photo of in my o...,for nonbeliever photo office request—total hyp...,for nonbeliev photo offic request—tot hypocrit...,for nonbeliever photo office request—total hyp...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10356,2021-01-06,2020-election,Many States want to decertify the mistake they...,If Vice President @Mike_Pence comes through fo...,0,737796,"If Vice President comes through for us, we wil...","if vice president comes through for us, we wil...","if vice president comes through for us, we wil...",if vice president comes through for us we will...,if vice president comes through for us we will...,if vice president comes us win presidency many...,if vice presid come us win presid mani state w...,if vice president come u win presidency many s...
10357,2021-01-06,2020-election,"based on irregularities and fraud, plus corrup...","States want to correct their votes, which they...",0,737796,"States want to correct their votes, which they...","states want to correct their votes, which they...","states want to correct their votes, which they...",states want to correct their votes which they ...,states want to correct their votes which they ...,states want correct votes know based irregular...,state want correct vote know base irregular fr...,state want correct vote know base irregularity...
10358,2021-01-06,2020-election,Our Election Process is worse than that of thi...,"They just happened to find 50,000 ballots late...",0,737796,"They just happened to find 50,000 ballots late...","they just happened to find 50,000 ballots late...","they just happened to find , ballots late last...",they just happened to find ballots late last ...,they just happened to find ballots late last n...,they happened find ballots late last night usa...,they happen find ballot late last night usa em...,they happen find ballots late last night usa e...
10359,2021-01-06,2020-election,a FRAUD,The States want to redo their votes. They foun...,0,737796,The States want to redo their votes. They foun...,the states want to redo their votes. they foun...,the states want to redo their votes. they foun...,the states want to redo their votes they found...,the states want to redo their votes they found...,the states want redo votes found voted fraud l...,the state want redo vote found vote fraud legi...,the state want redo vote find vote fraud legis...


### 3: Consequences of preprocessing

Create a new document-feature matrix with the preprocessed tweets. How do the dimensions of this matrix compare with those of the matrix you created in 1.3?


In [160]:
matrix = vectorizer.fit_transform(df['tweet_lemmatized'])
matrix.shape

(10360, 8575)

In [161]:
matrix = vectorizer.fit_transform(df['tweet_stemmed'])
matrix.shape

(10360, 7565)

In [163]:
#To see the new bag of words, uncomment below
#vectorizer.get_feature_names()