# Sentiment Analysis 
This is a sentiment analysis program, written in Python, that parses tweets fetched from Twitter. For this particular program we'll use tweets about Safaricom.

# import the libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string
import nltk
import warnings
from textblob import TextBlob
%matplotlib inline
warnings.filterwarnings('ignore')
plt.style.use('fivethirtyeight')

# Read the csv file using pandas

In [None]:
# Load the tweets CSV into a DataFrame and display it
# (the original run reported 114867 rows and 2 columns)
df = pd.read_csv(filepath_or_buffer='safaricom_tweets 2.csv')
df

# Data cleaning

In [None]:
#create a function to clean tweets
def cleanTxt(text):
    """Strip Twitter-specific noise from a single tweet.

    Removes, in order: @mentions (handles may contain letters, digits and
    underscores), the '#' symbol (the tag text itself is kept), the retweet
    marker 'RT' when it stands as its own word, and http/https links.

    Parameters:
        text: the raw tweet as a str.

    Returns:
        The tweet with the pieces above deleted. Surrounding whitespace is
        left as-is, so removals may leave leading or doubled spaces.
    """
    # Twitter handles may contain underscores, so include '_' in the class.
    text=re.sub(r'@[A-Za-z0-9_]+','',text)# removed @mentions
    text=re.sub(r'#','',text)#removing '#' symbol
    # \b keeps words that merely end in "RT" (e.g. "SMART ") intact.
    text=re.sub(r'\bRT[\s]+','',text)#removing RT
    # Match the scheme including ':' and then every non-space character.
    text=re.sub(r'https?://\S+','',text)#removing the hyper link
    return text
# Run every tweet through cleanTxt and preview the first rows
df['tweet']=df['tweet'].map(cleanTxt)
df.head()

# Get the Subjectivity and the polarity

In [None]:
#create a function to get the subjectivity
def getSubjectivity(text):
    """Return the TextBlob subjectivity score of `text`."""
    sentiment = TextBlob(text).sentiment
    return sentiment.subjectivity

#create a function to get the polarity
def getPolarity(text):
    """Return the TextBlob polarity score of `text`."""
    sentiment = TextBlob(text).sentiment
    return sentiment.polarity

# Score every tweet and store the results in two new columns
df['subjectivity']=df['tweet'].map(getSubjectivity)
df['polarity']=df['tweet'].map(getPolarity)

# Preview the dataframe including the new columns
df.head()

In [None]:
# Word cloud of the most frequent words across all tweets
all_words = " ".join(df['tweet'])  # one space-separated string of every tweet

from wordcloud import WordCloud
cloud_builder = WordCloud(width=700, height=400, random_state=42, max_font_size=100)
wordcloud = cloud_builder.generate(all_words)

# Render the cloud as an image without axes
plt.figure(figsize=(15,8))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

# sentiment analysis

In [None]:
#create a function to compute the negative,positive and neutral analysis
def getAnalysis(score):
    """Map a polarity score to a sentiment label.

    Returns 'Negative' for scores below zero, 'Neutral' for exactly zero,
    and 'Positive' for everything else.
    """
    if score < 0:
        return 'Negative'
    if score == 0:
        return 'Neutral'
    return 'Positive'
# Label every tweet from its polarity score, then display the dataframe
df['Analysis']=df['polarity'].map(getAnalysis)

df

In [None]:
#print all of the positive tweets, ordered from lowest to highest polarity
sortedDF=df.sort_values(by=['polarity'])
# Filter with a boolean mask and iterate over the values themselves.
# sortedDF['tweet'][i] is a label lookup: it ignored the sort order and
# raised KeyError whenever the index was not exactly 0..n-1.
positive_tweets=sortedDF.loc[sortedDF['Analysis']=='Positive','tweet']
for j, tweet in enumerate(positive_tweets, start=1):
    print(str(j)+')'+tweet)
    print()

In [None]:
#print all of the negative tweets, ordered from lowest to highest polarity
sortedDF=df.sort_values(by=['polarity'])
# Filter with a boolean mask and iterate over the values themselves.
# sortedDF['tweet'][i] is a label lookup: it ignored the sort order and
# raised KeyError whenever the index was not exactly 0..n-1.
negative_tweets=sortedDF.loc[sortedDF['Analysis']=='Negative','tweet']
for j, tweet in enumerate(negative_tweets, start=1):
    print(str(j)+')'+tweet)
    print()

In [None]:
#print all of the neutral tweets
sortedDF=df.sort_values(by=['polarity'])
# Filter with a boolean mask and iterate over the values themselves.
# sortedDF['tweet'][i] is a label lookup: it ignored the sort order and
# raised KeyError whenever the index was not exactly 0..n-1.
neutral_tweets=sortedDF.loc[sortedDF['Analysis']=='Neutral','tweet']
for j, tweet in enumerate(neutral_tweets, start=1):
    print(str(j)+')'+tweet)
    print()

In [None]:
#plot polarity (x-axis) against subjectivity (y-axis), one point per tweet
tweet_polarity=df['polarity']
tweet_subjectivity=df['subjectivity']
# Pass the vectors by keyword: seaborn >= 0.12 only accepts `data`
# positionally and raises TypeError for positional x/y.
sns.scatterplot(x=tweet_polarity,
                y=tweet_subjectivity,
                s=10);


plt.title("Sentiment Analysis", fontsize = 20)
plt.xlabel('polarity', fontsize=15)
plt.ylabel('subjectivity', fontsize=15)
plt.tight_layout()

In [None]:
#Share of tweets labelled Positive, as a percentage rounded to one decimal
ptweets=df.loc[df['Analysis']=='Positive','tweet']

round(len(ptweets)/len(df)*100,1)

In [None]:
#Share of tweets labelled Negative, as a percentage rounded to one decimal
ptweets=df.loc[df['Analysis']=='Negative','tweet']

round(len(ptweets)/len(df)*100,1)

In [None]:
#Share of tweets labelled Neutral, as a percentage rounded to one decimal
ptweets=df.loc[df['Analysis']=='Neutral','tweet']

round(len(ptweets)/len(df)*100,1)

In [None]:
#show the value counts
# A bare expression is only displayed when it is the last line of a cell,
# so print the counts explicitly and reuse them for the plot.
sentiment_counts=df['Analysis'].value_counts()
print(sentiment_counts)

#plot and visualize the counts
# Draw first, then label the returned axes, so the labels cannot be
# overwritten by the plot call (pandas may set the x label from the index name).
ax=sentiment_counts.plot(kind='bar')
ax.set_title('sentiment analysis')
ax.set_xlabel('sentiment')
ax.set_ylabel('counts')
plt.show()

# Machine Learning

# Support Vector Machine

In [None]:
#Import the libraries of Support Vector Machine
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report

In [None]:
# Preview the first five rows before building the features
df.head(n=5)

In [None]:
# Features: TF-IDF of the tweet text (at most 5000 terms); target: the sentiment label
tfidf=TfidfVectorizer(max_features=5000)
x=df['tweet']
y=df['Analysis']

# Split the raw text BEFORE fitting the vectorizer so the held-out tweets
# never influence the vocabulary or IDF weights (fitting on the whole corpus
# leaks test data into the features). x stays the raw text column.
text_train,text_test,y_train,y_test=train_test_split(x,y, test_size=0.2, random_state=0)

x_train=tfidf.fit_transform(text_train)  # learn vocabulary/IDF from the training tweets only
x_test=tfidf.transform(text_test)        # encode the test tweets with the fitted vocabulary

In [None]:
# Fit a linear support vector classifier on the training split
clf=LinearSVC()
clf.fit(X=x_train, y=y_train)

In [None]:
# Evaluate the SVM on the held-out split and print the classification report
y_pred=clf.predict(X=x_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
# Classify one hand-written sentence with the fitted vectorizer and SVM
x="i am happy today !"
vec=tfidf.transform(raw_documents=[x])  # transform expects an iterable of documents
clf.predict(X=vec)

# Naïve Bayes algorithms

In [None]:
from sklearn.naive_bayes import MultinomialNB
# Fit a multinomial Naive Bayes model and predict the held-out split
model_naive=MultinomialNB()
model_naive.fit(x_train, y_train)
predicted_naive=model_naive.predict(x_test)

In [None]:
# Print the classification report for the Naive Bayes predictions
print(classification_report(y_true=y_test, y_pred=predicted_naive))

# Logistic Regression algorithm

In [None]:
from sklearn.linear_model import LogisticRegression
# Fit a logistic regression model and predict the held-out split
logistic_reg=LogisticRegression()
logistic_reg.fit(x_train, y_train)
predicted_logistic_reg=logistic_reg.predict(x_test)

In [None]:
# Print the classification report for the logistic regression predictions
print(classification_report(y_true=y_test, y_pred=predicted_logistic_reg))