<a href="https://colab.research.google.com/github/Mahathi-Dundigal/Data-Science-project-series/blob/main/Sentiment_Analysis_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import precision_score, recall_score, f1_score

In [2]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [6]:
data = pd.read_csv('/content/test.csv', encoding='latin-1')

In [7]:
data.head()

Unnamed: 0,textID,text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²)
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral,morning,0-20,Afghanistan,38928346.0,652860.0,60.0
1,96d74cb729,Shanghai is also really exciting (precisely -...,positive,noon,21-30,Albania,2877797.0,27400.0,105.0
2,eee518ae67,"Recession hit Veronique Branquinho, she has to...",negative,night,31-45,Algeria,43851044.0,2381740.0,18.0
3,01082688c6,happy bday!,positive,morning,46-60,Andorra,77265.0,470.0,164.0
4,33987a8ee5,http://twitpic.com/4w75p - I like it!!,positive,noon,60-70,Angola,32866272.0,1246700.0,26.0


In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4815 entries, 0 to 4814
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   textID            3534 non-null   object 
 1   text              3534 non-null   object 
 2   sentiment         3534 non-null   object 
 3   Time of Tweet     3534 non-null   object 
 4   Age of User       3534 non-null   object 
 5   Country           3534 non-null   object 
 6   Population -2020  3534 non-null   float64
 7   Land Area (Km²)   3534 non-null   float64
 8   Density (P/Km²)   3534 non-null   float64
dtypes: float64(3), object(6)
memory usage: 338.7+ KB


In [9]:
data.describe()

Unnamed: 0,Population -2020,Land Area (Km²),Density (P/Km²)
count,3534.0,3534.0,3534.0
mean,39418910.0,672249.9,348.894171
std,146875700.0,1839134.0,1967.012367
min,801.0,0.0,2.0
25%,1968001.0,22810.0,35.0
50%,8696453.0,112760.0,87.0
75%,28435940.0,527970.0,214.0
max,1439324000.0,16376870.0,26337.0


In [10]:
data.tail()

Unnamed: 0,textID,text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²)
4810,,,,,,,,,
4811,,,,,,,,,
4812,,,,,,,,,
4813,,,,,,,,,
4814,,,,,,,,,


In [11]:
data.dropna(inplace=True)

In [12]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()


In [13]:
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Tokenization
    words = word_tokenize(text)
    # Removing stop words and special characters
    words = [word for word in words if word.isalnum() and word not in stop_words]
    # Lemmatization
    words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)

In [14]:
data['clean_text'] = data['text'].apply(preprocess_text)

In [15]:
X_train, X_test, y_train, y_test = train_test_split(data['clean_text'], data['sentiment'], test_size=0.2, random_state=42)


In [16]:
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)


In [17]:
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_tfidf, y_train)
nb_predictions = nb_classifier.predict(X_test_tfidf)

In [18]:
print("Naive Bayes Model Performance:")
print("Accuracy:", accuracy_score(y_test, nb_predictions))
print("Precision:", precision_score(y_test, nb_predictions, average='weighted'))
print("Recall:", recall_score(y_test, nb_predictions, average='weighted'))
print("F1 Score:", f1_score(y_test, nb_predictions, average='weighted'))
print(classification_report(y_test, nb_predictions))

Naive Bayes Model Performance:
Accuracy: 0.5968882602545968
Precision: 0.6430956536739751
Recall: 0.5968882602545968
F1 Score: 0.5876438177289545
              precision    recall  f1-score   support

    negative       0.78      0.37      0.50       207
     neutral       0.52      0.78      0.62       286
    positive       0.68      0.57      0.62       214

    accuracy                           0.60       707
   macro avg       0.66      0.57      0.58       707
weighted avg       0.64      0.60      0.59       707



In [19]:
cv_scores = cross_val_score(nb_classifier, X_train_tfidf, y_train, cv=5)
print("Cross-Validation Scores:", cv_scores)
print("Mean CV Accuracy:", np.mean(cv_scores))


Cross-Validation Scores: [0.58303887 0.5565371  0.58230088 0.55929204 0.58230088]
Mean CV Accuracy: 0.5726939554082368


In [20]:

def preprocess_single_text(text):
    cleaned_text = preprocess_text(text)
    return [cleaned_text]

example_text = "I really enjoyed the movie, it was fantastic!"

preprocessed_example_text = preprocess_single_text(example_text)

vectorized_example_text = tfidf_vectorizer.transform(preprocessed_example_text)

predicted_sentiment = nb_classifier.predict(vectorized_example_text)

print("Predicted Sentiment:", predicted_sentiment[0])


Predicted Sentiment: positive
