In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re
import pycountry
import pickle
import us

import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer 

from gensim.models import Word2Vec

from textblob import TextBlob

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score

%matplotlib inline

%config IPCompleter.greedy=True

pd.set_option('display.max_colwidth', None)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/linhnguyen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/linhnguyen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/linhnguyen/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


ModuleNotFoundError: No module named 'gensim'

In [2]:
# Take a look at the output
stimulus_raw = pd.read_csv("stimulus_raw.csv")

# Print the first five rows
display(stimulus_raw.head())

# Print the summary statistics
print(stimulus_raw.describe())

# Print the info
print(stimulus_raw.info())

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Content,Location,Username,Retweet-Count,Favorites,Created at
0,0,0,@ZupancicJareen Follow #BidenLies #BidenLied a...,,Trish22758076,0,0,2021-03-30 22:59:04
1,1,1,Link to get :👇👇👇\nhttps://t.co/kRoIosUS6y\n#Tr...,Etats-Unis,Hicham21940587,1,0,2021-03-30 22:48:48
2,2,2,#Trump 🇺🇸\nWaPo Fact-Checkers Slam #BidenLied ...,"Hyères, France",C_W_UK,0,0,2021-03-30 22:47:54
3,3,3,YouTube doing away with dislikes bc of Whiteho...,"Clown World, USA",BEcAMearekonING,0,0,2021-03-30 22:46:12
4,4,4,Everyone knows this is happening to protect #B...,,BilukCyril,0,0,2021-03-30 22:39:17


        Unnamed: 0  Unnamed: 0.1  Retweet-Count     Favorites
count  8084.000000   8084.000000    8084.000000   8084.000000
mean   4041.500000    716.413409       2.830901     11.922934
std    2333.794121    435.993453     117.158487    552.269555
min       0.000000      0.000000       0.000000      0.000000
25%    2020.750000    336.000000       0.000000      0.000000
50%    4041.500000    691.000000       0.000000      0.000000
75%    6062.250000   1095.000000       0.000000      2.000000
max    8083.000000   1499.000000   10016.000000  47094.000000
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8084 entries, 0 to 8083
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Unnamed: 0     8084 non-null   int64 
 1   Unnamed: 0.1   8084 non-null   int64 
 2   Content        8084 non-null   object
 3   Location       5682 non-null   object
 4   Username       8084 non-null   object
 5   Retweet-Count  8084 non-null  

In [3]:
#We don't need the 2 columns called Unnamed, we're gonna drop them. Also, we will remove duplicated tweets. 
stimulus_raw.drop(columns=["Unnamed: 0", "Unnamed: 0.1"], axis=1, inplace=True)

# Drop duplicated rows
stimulus_raw.drop_duplicates(inplace=True)

# Created at column's type should be datatime
stimulus_raw["Created at"] = pd.to_datetime(stimulus_raw["Created at"])

# Print the info again
print(stimulus_raw.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7696 entries, 0 to 8083
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   Content        7696 non-null   object        
 1   Location       5414 non-null   object        
 2   Username       7696 non-null   object        
 3   Retweet-Count  7696 non-null   int64         
 4   Favorites      7696 non-null   int64         
 5   Created at     7696 non-null   datetime64[ns]
dtypes: datetime64[ns](1), int64(2), object(3)
memory usage: 420.9+ KB
None


In [4]:
# Now we clean up the content of the tweets. We'll remove stopwords, punctuation, mention, hashtag, links, and one-or-two letter words lik a or an
#After that, we'll tokenize the tweets
def clean_up(tweet):
    
    # Remove links
    tweet = re.sub(r"http\S+|www\S+|https\S+", '', tweet, flags=re.MULTILINE)
    
    # Remove mentions and hashtag
    tweet = re.sub(r'\@\w+|\#','', tweet)
    
    # Tokenize the words
    tokenized = word_tokenize(tweet)

    # Remove the stop words
    tokenized = [token for token in tokenized if token not in stopwords.words("english")] 

    # Lemmatize the words
    lemmatizer = WordNetLemmatizer()
    tokenized = [lemmatizer.lemmatize(token, pos='a') for token in tokenized]

    # Remove non-alphabetic characters and keep the words contains three or more letters
    tokenized = [token for token in tokenized if token.isalpha() and len(token)>2]
    
    return tokenized
    
# Call the function and store the result into a new column
stimulus_raw["Processed"] = stimulus_raw["Content"].str.lower().apply(clean_up)


# Print the first fifteen rows of Processed
display(stimulus_raw[["Processed"]].head(15))

Unnamed: 0,Processed
0,"[follow, bidenlies, bidenlied, bidenremorse, b..."
1,"[link, get, trump, fuckbiden, bidenlied, repub..."
2,"[trump, wapo, slam, bidenlied, georgia, electi..."
3,"[youtube, away, dislikes, whitehouse, social, ..."
4,"[everyone, knows, happening, protect, biden, a..."
5,"[kidsincages, good, kidsincages, racist, good,..."
6,"[kidsincages, good, kidsincages, racist, good,..."
7,"[never, seen, anyone, protect, person, like, b..."
8,"[tired, imposed, kids, school, sick, america, ..."
9,"[stopaapihate, abcnews, cnn, hannity, tuckerca..."


In [5]:
#We record the length of the tweets and the number of words in each tweets in new columns
# Get the tweet lengths
stimulus_raw["Length"] = stimulus_raw["Content"].str.len()

# Get the number of words in tweets
stimulus_raw["Words"] = stimulus_raw["Content"].str.split().str.len()

# Display the new columns
display(stimulus_raw[["Length", "Words"]])

Unnamed: 0,Length,Words
0,79,8
1,255,24
2,119,14
3,278,27
4,280,37
...,...,...
8079,104,15
8080,155,22
8081,140,18
8082,52,6


In [6]:
#we want to record the locations of the tweet, so that maybe we can analyze how people from different regions feel about this stimulus check.
#we'll store the location of the tweet to a column. If there's no location found, we'll mark it as 'unknown'
stimulus_raw["Location"].fillna("unknown", inplace=True)

# Print the unique locations and number of unique locations
print("Unique Values:",stimulus_raw["Location"].unique())
print("Unique Value count:",len(stimulus_raw["Location"].unique()))

Unique Values: ['unknown' 'Etats-Unis' 'Hyères, France' ... 'PHL, PA' 'Kansas, USA'
 'Tennessee']
Unique Value count: 2043


In [7]:
# After recording the locations, we want to classify the locations that are states within the US versus foreign countries.
def get_countries(location):
    
    # If location is a country name return its alpha2 code
    if pycountry.countries.get(name= location):
        return pycountry.countries.get(name = location).alpha_2
    
    # If location is a subdivisions name return the countries alpha2 code
    try:
        pycountry.subdivisions.lookup(location)
        return pycountry.subdivisions.lookup(location).country_code
    except:
        # If the location is neither country nor subdivision return the "unknown" tag
        return "unknown"

# Call the function and store the country codes in the Country column
stimulus_raw["Country"] = stimulus_raw["Location"].apply(get_countries)

# Print the unique values
print(stimulus_raw["Country"].unique())

# Print the number of unique values
print("Number of unique values:",len(stimulus_raw["Country"].unique()))


['unknown' 'US' 'GE' 'FR' 'CU' 'CA' 'EG' 'IE' 'HK' 'BS' 'LU' 'AU' 'IL'
 'GB' 'SG' 'MY' 'NL' 'DZ' 'GT' 'NZ' 'AQ' 'ES' 'NG' 'JM' 'DE' 'LR' 'NI'
 'JP' 'UG' 'DK' 'CR' 'UM']
Number of unique values: 32


In [8]:
def get_states(location):
    
    # If location is a US state name return its alpha2 code
    if us.states.lookup(location):
        state = us.states.lookup(location)
        return state.abbr
    
    # If location is not a us state, return not-us or unknown
    try:
        pycountry.subdivisions.lookup(location)
        return "not-us"
    except:
        # If the location is neither country nor subdivision return the "unknown" tag
        return "unknown"

# Call the function and store the country codes in the Country column
stimulus_raw["States"] = stimulus_raw["Location"].apply(get_states)

# Print the unique values
print(stimulus_raw["States"].unique())

# Print the number of unique values
print("Number of unique values:",len(stimulus_raw["States"].unique()))

['unknown' 'GA' 'OK' 'ME' 'CT' 'OH' 'not-us' 'TX' 'WI' 'AR' 'FL' 'MN' 'PA'
 'IA' 'LA' 'NY' 'MI' 'IL' 'OR' 'NJ' 'KY' 'NH' 'SC' 'MA' 'KS' 'WV' 'CA'
 'AL' 'IN' 'CO' 'NV' 'MO' 'VA' 'DE' 'NM' 'SD' 'AZ' 'HI' 'WA' 'MD' 'NC'
 'DC' 'MS' 'VT' 'AK' 'TN']
Number of unique values: 46


In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer


In [13]:
from gensim.models import Word2Vec

In [37]:
tfidf_stops = ["stimulus", "check", "stimulus check", "biden", "bidenlied"]


# Initialize a Tf-idf Vectorizer
vectorizer = TfidfVectorizer(max_features=8, stop_words= tfidf_stops)
m=stimulus_raw["Process"].apply(lambda x: ' '.join(x))
tfidf_matrix = vectorizer.fit_transform(m)
display(tfidf_matrix)

KeyError: 'Process'

NameError: name 'tfidf_matrix' is not defined