<a href="https://colab.research.google.com/github/Joykareko/Data-Science-Projects/blob/main/Kenyas_Politics_Sentiment_Analysis_ML_Project_10.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#importing the dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
from textblob import TextBlob
from wordcloud import WordCloud

In [3]:
#loading the data
election_data = pd.read_csv('/content/kenya_political_tweets.csv')
election_data.head(3)

Unnamed: 0,id,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,source,retweets,favorites,is_retweet
0,1396091006030856192,Doomster,Utopia slums,"Alt-left, liberal, pronoun:them",2009-08-23 12:22:53,309,907,8050,False,2021-05-22 13:10:08,@PrettyAmmina The change to the new blue unifo...,,Twitter Web App,0.0,0.0,False
1,1395981137764397056,Unruly 🗯,Morayfield,I don't know what you thought this was gonna b...,2011-04-20 19:35:54,1053,523,5279,False,2021-05-22 05:53:33,I think it's now safe to say President Uhuru K...,,Twitter for Android,0.0,0.0,False
2,1395433210490798082,Khavin,Nairobi,CCTV installation and MAINTENANCE technician. ...,2020-02-11 04:52:44,4202,3735,29563,False,2021-05-20 17:36:17,@_CrazyKenyan President Uhuru Kenyatta bribed ...,['bbinonsense'],Twitter Web App,0.0,1.0,False


In [4]:
#checking the rows and columns
election_data.shape

(11723, 16)

In [5]:
#checking the general info
election_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11723 entries, 0 to 11722
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                11723 non-null  object 
 1   user_name         11723 non-null  object 
 2   user_location     9239 non-null   object 
 3   user_description  10730 non-null  object 
 4   user_created      11722 non-null  object 
 5   user_followers    11722 non-null  object 
 6   user_friends      11722 non-null  object 
 7   user_favourites   11722 non-null  object 
 8   user_verified     11721 non-null  object 
 9   date              11722 non-null  object 
 10  text              11722 non-null  object 
 11  hashtags          1174 non-null   object 
 12  source            11722 non-null  object 
 13  retweets          11721 non-null  float64
 14  favorites         11721 non-null  float64
 15  is_retweet        11721 non-null  object 
dtypes: float64(2), object(14)
memory usage: 

In [6]:
#missing values?
election_data.isnull().sum()

id                      0
user_name               0
user_location        2484
user_description      993
user_created            1
user_followers          1
user_friends            1
user_favourites         1
user_verified           2
date                    1
text                    1
hashtags            10549
source                  1
retweets                2
favorites               2
is_retweet              2
dtype: int64

In [7]:
#interested in user loaction and text
election_data['user_location'].value_counts()

Nairobi, Kenya                    2954
Kenya                             1189
Nairobi                            558
Limuru, Kenya                      186
Mombasa, Kenya                     173
                                  ... 
South Carolina, USA                  1
Nakuru,kenya                         1
PFM Act-2012,Section 155(3)(b)       1
Khachonge, Bungoma County            1
Ndenderù                             1
Name: user_location, Length: 986, dtype: int64

In [8]:
#stopwords do not add value to the text
import nltk
nltk.download('stopwords')
print(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'ea

In [9]:
#replacing the null values in 'text' column with empty strings
#replacing the null values in 'location' column with mode
election_data.fillna(value = {'text':' ','user_location':'Nairobi'},inplace = True)
election_data.isnull().sum()

id                      0
user_name               0
user_location           0
user_description      993
user_created            1
user_followers          1
user_friends            1
user_favourites         1
user_verified           2
date                    1
text                    0
hashtags            10549
source                  1
retweets                2
favorites               2
is_retweet              2
dtype: int64

Cleaning the Data

In [10]:
#cleaning the text column which will be our main analysis
election_data = election_data['text']

In [11]:
#cleaning the data step 1 by removing tags ,@, RT
def CleanTxt(text):
    text = re.sub(r'@[A-Za-z0-9]+','',text) #removes @mentions and substitutes with an empty string
    text = re.sub(r'#','', text) #removes #tags
    text = re.sub(r'RT[\s]+','',text) #removes the RT
    text = re.sub(r'https?:\/\/\S+','',text)
    
    return text

election_data = election_data.apply(CleanTxt)
election_data

0         The change to the new blue uniforms was order...
1        I think it's now safe to say President Uhuru K...
2        @_CrazyKenyan President Uhuru Kenyatta bribed ...
3        As _Kuria has just been saying,President Uhuru...
4        David Murathe the KEMSA thief and President Uh...
                               ...                        
11718    Preparations are in top gear for the coronatio...
11719    "I managed to kick William Ruto out of Jubilee...
11720    If William Ruto and Uhuru Kenyatta could not m...
11721    DP William Ruto has criticized proponents of B...
11722    In the early 90s, companies used to look for g...
Name: text, Length: 11723, dtype: object

In [12]:
#stemming procedure
#stemming removes the root words(prefixes/suffixes)
port_stem = PorterStemmer()

In [13]:
#create a stemming function
def stemming(text):
    stemmed_content = re.sub('[^a-zA-Z]',' ',text)
    stemmed_content = stemmed_content.lower()
    stemmed_content = stemmed_content.split()
    stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
    stemmed_content = ' '.join(stemmed_content)
    return stemmed_content

In [14]:
election_data = election_data.apply(stemming)
election_data

0        chang new blue uniform order presid uhuru keny...
1        think safe say presid uhuru kenyatta amepatia ...
2        crazykenyan presid uhuru kenyatta bribe mca mp...
3        kuria say presid uhuru kenyatta finish ago mt ...
4        david murath kemsa thief presid uhuru kenyatta...
                               ...                        
11718    prepar top gear coron deputi presid william ru...
11719       manag kick william ruto jubile murath one word
11720    william ruto uhuru kenyatta could maintain sim...
11721    dp william ruto critic propon bbi pursu consti...
11722    earli compani use look graduat even villag gra...
Name: text, Length: 11723, dtype: object

In [16]:
#creating a function that shows subjectivity
def getSubjectivity(text):
    return TextBlob(text).sentiment.subjectivity

#creating a function that shows the polarity

def getPolarity(text):
    return TextBlob(text).sentiment.polarity

election_data['Subjectivity'] = election_data.apply(getSubjectivity)
election_data['Polarity'] = election_data.apply(getPolarity)

#election_data 

TypeError: ignored