# Sentiment Analysis project which is able to classify positive,negative and neutral reviews

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("master_twitter_sentiment.csv")

In [3]:
df.head()

Unnamed: 0,no,num,sentiment,text
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [4]:
df = df[["sentiment","text"]]

In [5]:
df.isnull().sum()

sentiment      0
text         686
dtype: int64

In [6]:
df.dropna(inplace=True)

In [7]:
df["sentiment"].value_counts()

Negative      22624
Positive      20932
Neutral       18393
Irrelevant    13047
Name: sentiment, dtype: int64

In [8]:
df.drop(df[df["sentiment"]=="Irrelevant"].index,axis=0,inplace=True)

In [9]:
df = df[df["text"].apply(len)>5]

In [10]:
import re

In [11]:
def datacleaning(text):
    cleaned_text = re.sub(r"[^\w\s]|[\d]"," ",text)
    cleaned_text = re.sub(r"[_]"," ",cleaned_text)
    return cleaned_text

In [12]:
df["text"] = df["text"].apply(datacleaning)

In [13]:
df["text"] = df["text"].str.lower()

In [14]:
import nltk

In [15]:
from nltk.corpus import stopwords

In [16]:
nltk_stopwords = stopwords.words("english")

In [17]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

In [18]:
sklearn_stopwords = ENGLISH_STOP_WORDS

In [19]:
combined_stopwords = list(set(nltk_stopwords).union(sklearn_stopwords))

In [20]:
from nltk.stem import WordNetLemmatizer

In [21]:
lemm = WordNetLemmatizer()

In [22]:
def process_data(text):
    words = nltk.word_tokenize(text)
    lemmatized = [lemm.lemmatize(word) for word in words if word not in combined_stopwords ]
    return " ".join(lemmatized)
    

In [23]:
df["text"] = df["text"].apply(process_data)

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [25]:
tfidf = TfidfVectorizer(max_features=10000)

In [26]:
x = tfidf.fit_transform(df["text"])

In [27]:
y = df["sentiment"]

In [28]:
from sklearn.model_selection import train_test_split

In [29]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2)

In [30]:
from sklearn.ensemble import RandomForestClassifier

In [31]:
rf = RandomForestClassifier()

In [32]:
rf.fit(x_train,y_train)

In [33]:
pred = rf.predict(x_test)

In [34]:
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report

In [35]:
confusion_matrix(y_test,pred)

array([[4103,   74,  177],
       [ 131, 3327,  217],
       [ 144,  108, 3740]], dtype=int64)

In [36]:
print(classification_report(y_test,pred))

              precision    recall  f1-score   support

    Negative       0.94      0.94      0.94      4354
     Neutral       0.95      0.91      0.93      3675
    Positive       0.90      0.94      0.92      3992

    accuracy                           0.93     12021
   macro avg       0.93      0.93      0.93     12021
weighted avg       0.93      0.93      0.93     12021



In [38]:
import pickle

In [39]:
pickle.dump(rf,open("SentimentAnalysis.pkl","wb"))