In [99]:
import numpy as np
import sklearn.datasets as ds
from sklearn.linear_model import LogisticRegression 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import FunctionTransformer
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
import re
import string
from sklearn.metrics import confusion_matrix,classification_report

In [100]:
# Load the sentiment dataset from a CSV given by a relative path
# (the spaces inside the file name are part of the name).
data = pd.read_csv('sentiment_data NEW .csv')

In [101]:
# List the column labels of the loaded frame.
data.keys()

Index(['Unnamed: 0', 'Comment', 'Sentiment'], dtype='object')

In [102]:
# Display the frame; the recorded output shows only the first and last rows.
data

Unnamed: 0.1,Unnamed: 0,Comment,Sentiment
0,0,lets forget apple pay required brand new iphon...,1
1,1,nz retailers don’t even contactless credit car...,0
2,2,forever acknowledge channel help lessons ideas...,2
3,3,whenever go place doesn’t take apple pay doesn...,0
4,4,apple pay convenient secure easy use used kore...,2
...,...,...,...
241140,241921,crores paid neerav modi recovered congress lea...,0
241141,241922,dear rss terrorist payal gawar modi killing pl...,0
241142,241923,cover interaction forum left,1
241143,241924,big project came india modi dream project happ...,1


In [103]:
# Summary statistics for every column; include='all' also reports
# count/unique/top/freq for the text column alongside the numeric stats.
data.describe(include='all')

Unnamed: 0.1,Unnamed: 0,Comment,Sentiment
count,241145.0,240928,241145.0
unique,,214157,
top,,modi,
freq,,248,
mean,121289.252479,,1.198822
std,69709.762092,,0.78511
min,0.0,,0.0
25%,61063.0,,1.0
50%,121350.0,,1.0
75%,181636.0,,2.0


In [104]:
# Model input is the comment text; the target is the sentiment label column.
X, y = data['Comment'], data['Sentiment']

In [105]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=32)

In [106]:
# Size of the training split (1-D: one comment per entry).
X_train.shape

(192916,)

In [107]:
def remove_missing_values(df):
    """Return a new frame containing only the rows of ``df`` with no missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter. It is not modified.

    Returns
    -------
    pandas.DataFrame
        ``df`` without any row that holds at least one NaN/None.
    """
    # Operate on the argument, not on the notebook-level ``data`` variable,
    # so the helper works for whichever frame the caller passes in.
    return df.dropna()

def remove_urls(text):
    """Delete every run of non-whitespace characters that begins with ``http``.

    The match needs at least one character after ``http``, so the bare word
    ``http`` on its own is left in place.
    """
    url_pattern = re.compile(r'http\S+')
    return url_pattern.sub('', text)

def remove_mentions(text):
    """Delete ``@`` followed by one or more word characters (e.g. ``@user_1``).

    A lone ``@`` with no word character after it is kept.
    """
    mention_pattern = re.compile(r'@\w+')
    return mention_pattern.sub('', text)

def remove_emojis(text):
    """Remove every character in the range U+10000 to U+10FFFF from ``text``.

    That range holds most emoji, but symbols with lower code points
    (for example U+2764) are not matched and are returned unchanged.

    Parameters
    ----------
    text : str
        Input string.

    Returns
    -------
    str
        ``text`` with the matched characters deleted; nothing is inserted in
        their place.
    """
    emoji_pattern = re.compile("[\U00010000-\U0010FFFF]", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

def remove_special_chars(text):
    """Keep only ASCII letters, a fixed set of accented lowercase letters and
    the space character; every other character is dropped.

    Digits, punctuation, tabs and newlines are all removed without any
    replacement.
    """
    keep = frozenset(string.ascii_letters) | frozenset("áéíóúãõàâêôç ")
    return ''.join(filter(keep.__contains__, text))

In [108]:
def clean_text(text):
    """Normalise one comment for vectorisation.

    Steps, in order: lowercase, remove URLs, remove @mentions, remove emoji,
    turn every whitespace run into a single space, remove characters that
    ``remove_special_chars`` does not allow, then collapse repeated spaces and
    trim both ends.

    Parameters
    ----------
    text : object
        The comment. Anything that is not a ``str`` (e.g. NaN for a missing
        comment) is treated as empty.

    Returns
    -------
    str
        The cleaned comment, or ``''`` for non-string input.
    """
    if not isinstance(text, str):
        return ''

    text = text.lower()
    text = remove_urls(text)
    text = remove_mentions(text)
    text = remove_emojis(text)
    # Convert tabs/newlines to plain spaces BEFORE filtering characters:
    # remove_special_chars keeps only the space character, so any other
    # whitespace would be deleted and the neighbouring words glued together.
    text = re.sub(r'\s+', ' ', text)
    text = remove_special_chars(text)
    # Deleting URLs, mentions and symbols can leave doubled or leading/trailing
    # spaces, so collapse and trim once everything has been removed.
    text = re.sub(r'\s+', ' ', text).strip()

    return text
# Replace the Comment column of `data` with its cleaned version.
# NOTE(review): X_train/X_test were split off in an earlier cell and the
# pipeline below runs clean_text on its own input, so this presumably only
# affects `data` itself. Confirm whether it is still needed.
data.Comment= data['Comment'].apply(clean_text)

In [109]:
def _clean_series(comments):
    """Apply ``clean_text`` to every element of ``comments`` (anything with ``.apply``, e.g. a pandas Series)."""
    return comments.apply(clean_text)


# A named function is used instead of a lambda so that the fitted pipeline can
# be pickled; a lambda inside FunctionTransformer cannot be serialised.
text_cleaner = FunctionTransformer(_clean_series)

# Steps: clean the text -> TF-IDF features (English stop words removed,
# vocabulary capped at 10000 terms) -> logistic regression classifier.
pipeline = Pipeline([
    ('cleaner', text_cleaner),
    ('tfidf', TfidfVectorizer(stop_words='english', max_features=10000)),
    ('lr', LogisticRegression(max_iter=1000)),
])
pipeline.fit(X_train, y_train)

In [110]:
# Predict a sentiment label for every held-out comment with the fitted pipeline.
y_pred = pipeline.predict(X_test)


In [111]:
# Put each prediction next to the true label, one row per test comment.
# The row labels in the printed output appear to be y_test's original index.
result_data=pd.DataFrame({'Predicted':y_pred, 'Actual':y_test})

In [112]:
# Print the predicted-vs-actual table as plain text.
print(result_data)

        Predicted  Actual
7090            2       2
76564           1       2
156258          0       2
169897          1       1
50526           2       2
...           ...     ...
28560           2       2
225298          1       1
131562          2       2
198110          1       1
64235           2       1

[48229 rows x 2 columns]


In [113]:
# Accuracy = fraction of test rows whose predicted label equals the true one.
# np.mean over the boolean comparison is vectorised, unlike the builtin sum(),
# which walks the values one by one in Python.
accuracy = np.mean(y_pred == y_test)
print(f"{accuracy=:.2f}")

accuracy=0.77


In [114]:
# Per-class precision, recall, F1 and support on the test split.
print ("\n Classification Report:\n",classification_report(y_test,y_pred))


 Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.65      0.71     10917
           1       0.72      0.82      0.77     16692
           2       0.82      0.80      0.81     20620

    accuracy                           0.77     48229
   macro avg       0.77      0.76      0.76     48229
weighted avg       0.78      0.77      0.77     48229

