"""
NLP Sentiment Analysis Model Documentation

Overview:
-----------
This document describes the NLP Sentiment Analysis model, which is trained using logistic regression. The model predicts a sentiment label for text data, indicating whether a given text expresses positive, negative, or neutral sentiment.

Model Details:
--------------
- Model Type: Logistic Regression
- Feature Extraction: TF-IDF (Term Frequency-Inverse Document Frequency)
- Data Cleaning: Removed irrelevant characters, HTML tags, and non-alphabetic characters.
- Preprocessing: Downsampled the dataset, removed stopwords, and applied tokenization and stemming.
- Evaluation Metrics: Accuracy, Precision, F1-Score

Usage:
------
1. Import the required libraries and modules.
2. Load the pre-trained model and necessary preprocessing components.
3. Input a text string to get the sentiment prediction.



In [195]:
#importing Necessary Libraries

import re

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,precision_score,f1_score


### Reading the data

In [2]:


import chardet


def read_csv_detect_encoding(path):
    """Load a CSV file whose text encoding is not known in advance.

    The raw bytes are passed to chardet to guess the encoding, and the
    file is then parsed by pandas using that guess.

    Parameters
    ----------
    path : str
        Location of the CSV file.

    Returns
    -------
    pandas.DataFrame
        The parsed file contents.
    """
    # The context manager closes the handle as soon as the bytes are read.
    with open(path, 'rb') as raw_file:
        raw_bytes = raw_file.read()
    detected_encoding = chardet.detect(raw_bytes)['encoding']
    return pd.read_csv(path, encoding=detected_encoding)


train_data = read_csv_detect_encoding('train.csv')
test_data = read_csv_detect_encoding('test.csv')



In [4]:
# Plain-text preview of the first ten training rows
print(train_data.iloc[:10])

       textID                                               text  \
0  cb774db0d1                I`d have responded, if I were going   
1  549e992a42      Sooo SAD I will miss you here in San Diego!!!   
2  088c60f138                          my boss is bullying me...   
3  9642c003ef                     what interview! leave me alone   
4  358bd9e861   Sons of ****, why couldn`t they put them on t...   
5  28b57f3990  http://www.dothebouncy.com/smf - some shameles...   
6  6e0c6d75b1  2am feedings for the baby are fun when he is a...   
7  50e14c0bb8                                         Soooo high   
8  e050245fbd                                        Both of you   
9  fc2cbefa9d   Journey!? Wow... u just became cooler.  hehe....   

                                       selected_text sentiment Time of Tweet  \
0                I`d have responded, if I were going   neutral       morning   
1                                           Sooo SAD  negative          noon   
2          

### Getting The stopwords

In [142]:
# Download the stopword corpus only when it is not already installed locally,
# so re-running the notebook does not need network access.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [7]:
#extracting the relevant fields
# .copy() detaches each two-column frame from the full CSV frame it was selected
# from, so the later cleaning steps and column assignments modify an independent
# frame rather than a slice (the situation pandas' SettingWithCopyWarning targets).
train_data = train_data[['text','sentiment']].copy()
test_data = test_data[['text','sentiment']].copy()
print(train_data.head(5))
print(test_data.head(5))


                                                text sentiment
0                I`d have responded, if I were going   neutral
1      Sooo SAD I will miss you here in San Diego!!!  negative
2                          my boss is bullying me...  negative
3                     what interview! leave me alone  negative
4   Sons of ****, why couldn`t they put them on t...  negative
                                                text sentiment
0  Last session of the day  http://twitpic.com/67ezh   neutral
1   Shanghai is also really exciting (precisely -...  positive
2  Recession hit Veronique Branquinho, she has to...  negative
3                                        happy bday!  positive
4             http://twitpic.com/4w75p - I like it!!  positive


### Data Cleaning

In [8]:

# Missing values per column in the training frame
train_data.isna().sum()


text         1
sentiment    0
dtype: int64

In [9]:
# Missing values per column in the test frame
test_data.isna().sum()

text         1281
sentiment    1281
dtype: int64

In [10]:
# Drop rows with missing text or label. Rebinding the names (instead of
# inplace=True) avoids mutating a frame that was derived by column selection
# and keeps the cell safe to re-run.
train_data = train_data.dropna()
test_data = test_data.dropna()

In [11]:
# Checking the class distribution of the sentiment labels in the training set
train_data['sentiment'].value_counts()

neutral     11117
positive     8582
negative     7781
Name: sentiment, dtype: int64

### Downsampling

In [12]:
def downsample(df, target_column, random_seed=42,
               classes=("neutral", "positive", "negative")):
    """Balance a labelled DataFrame by downsampling every class to the smallest one.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data containing ``target_column``.
    target_column : str
        Name of the column holding the class labels.
    random_seed : int, default 42
        Seed passed to ``DataFrame.sample`` so the selection is reproducible.
    classes : sequence of labels, default ("neutral", "positive", "negative")
        Labels to keep, in the order their rows appear in the result.
        Rows whose label is not listed are dropped.

    Returns
    -------
    pandas.DataFrame
        Rows of ``df`` (original index preserved) with the same number of
        rows for every label in ``classes``.

    Notes
    -----
    A class that is already at the minimum size is returned untouched, in
    its original row order; only larger classes are randomly sampled.
    """
    # Split the frame into one sub-frame per requested label.
    per_class = [df[df[target_column] == label] for label in classes]

    # The smallest class sets the target size for all of them.
    min_samples = min(len(part) for part in per_class)

    # Sample every class that exceeds the target. The previous version never
    # sampled the last class, so the result was only balanced when that class
    # happened to be the smallest.
    balanced_parts = [
        part if len(part) == min_samples
        else part.sample(n=min_samples, random_state=random_seed)
        for part in per_class
    ]

    return pd.concat(balanced_parts, axis=0)

In [13]:
# Balance the training set with downsample(), then show the per-class counts of the result.
train_data = downsample(train_data,"sentiment")
train_data['sentiment'].value_counts()

neutral     7781
positive    7781
negative    7781
Name: sentiment, dtype: int64

### Replacing Classes with Numerical Values

In [16]:
# Encode the sentiment labels as integers with a single mapping.
# The explicit cast guarantees an integer dtype instead of relying on replace()
# to downcast the object column implicitly (pandas has deprecated that downcast).
# It also fails immediately if an unexpected label is present.
SENTIMENT_IDS = {'neutral': 0, 'positive': 1, 'negative': 2}
train_data['sentiment'] = train_data['sentiment'].replace(SENTIMENT_IDS).astype('int64')

train_data['sentiment'].value_counts()

0    7781
1    7781
2    7781
Name: sentiment, dtype: int64

In [157]:


# Encode the test labels with the same integer ids used for the training labels
# (neutral=0, positive=1, negative=2). The explicit cast guarantees an integer
# dtype instead of relying on replace() to downcast the object column implicitly.
test_data['sentiment'] = (
    test_data['sentiment']
    .replace({'neutral': 0, 'positive': 1, 'negative': 2})
    .astype('int64')
)

### Stemming and Tokenization


In [20]:
#stemming
stem = PorterStemmer()

# Build the stopword lookup once. The original called stopwords.words('english')
# for every token, rebuilding the word list each time and scanning it linearly;
# a frozenset gives constant-time membership checks.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
_NON_LETTERS = re.compile('[^a-zA-Z]')


def stem_data(txt):
    """Normalise a piece of text into a space-separated string of stems.

    Steps: replace every non-letter character with a space, lowercase,
    split on whitespace, drop English stopwords, and stem the remaining
    tokens with the Porter stemmer.

    Parameters
    ----------
    txt : str
        Raw input text.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces (empty string if no
        token survives).
    """
    tokens = _NON_LETTERS.sub(' ', txt).lower().split()
    return ' '.join(stem.stem(token) for token in tokens if token not in _ENGLISH_STOPWORDS)

In [127]:
# Add a column holding the cleaned, stemmed version of every text
train_data["stemmed_data"] = train_data['text'].map(stem_data)
test_data["stemmed_data"] = test_data['text'].map(stem_data)

In [158]:
# First five training rows, now including the stemmed text
train_data.head()


Unnamed: 0,text,sentiment,stemmed_data
4634,i have to go to the doctor... i don`t want to....,0,go doctor want caus wait sooo long
8316,Can`t sleep but is happy that the Fugees are k...,0,sleep happi fuge keep compani
12622,Job??????? Not so much.,0,job much
21585,Nope wasn`t kidding at all. Sometimes I thin...,0,nope kid sometim think forest gump run year
18694,_from_hell how`s monday for you?,0,hell monday


In [159]:
# First five test rows, now including the stemmed text
test_data.head()

Unnamed: 0,text,sentiment,stemmed_data
0,Last session of the day http://twitpic.com/67ezh,0,last session day http twitpic com ezh
1,Shanghai is also really exciting (precisely -...,1,shanghai also realli excit precis skyscrap gal...
2,"Recession hit Veronique Branquinho, she has to...",2,recess hit veroniqu branquinho quit compani shame
3,happy bday!,1,happi bday
4,http://twitpic.com/4w75p - I like it!!,1,http twitpic com w p like


In [160]:
# Features are the stemmed text; targets are the encoded sentiment labels.
X_train, Y_train = train_data['stemmed_data'], train_data['sentiment']
X_test, Y_test = test_data['stemmed_data'], test_data['sentiment']


In [149]:
# Show the training features: the stemmed text column, still raw strings before vectorization
X_train

4634                    go doctor want caus wait sooo long
8316                         sleep happi fuge keep compani
12622                                             job much
21585          nope kid sometim think forest gump run year
18694                                          hell monday
                               ...                        
27464    rec game tri cri pain much need lose heart bre...
27470      lol know haha fall asleep get bore shaun p joke
27472             http twitpic com vr want visit anim late
27476    wish could come see u denver husband lost job ...
27477    wonder rake client made clear net forc dev lea...
Name: stemmed_data, Length: 23343, dtype: object

In [150]:
# Show the test features: the stemmed text column, still raw strings before vectorization
X_test

0                   last session day http twitpic com ezh
1       shanghai also realli excit precis skyscrap gal...
2       recess hit veroniqu branquinho quit compani shame
3                                              happi bday
4                               http twitpic com w p like
                              ...                        
3529                                    im tire sleep tri
3530    alon old hous thank net keep aliv kick whoever...
3531    know mean littl dog sink depress want move som...
3532             sutra next youtub video gonna love video
3533          http twitpic com woj omgssh ang cute ng bbi
Name: stemmed_data, Length: 3534, dtype: object

In [161]:
# Learn the TF-IDF vocabulary and weights from the training text only, then
# apply the same fitted vectorizer to the test text.
# The inputs are read from the DataFrame columns rather than from X_train/X_test,
# which this cell overwrites; this keeps the cell re-runnable.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train_data['stemmed_data'])
X_test = vectorizer.transform(test_data['stemmed_data'].astype(str))

In [162]:
# Print the vectorized training features; the output lists (row, column) positions with their stored weights
print(X_train)

  (0, 8914)	0.36817848539472803
  (0, 14019)	0.43344927711716036
  (0, 16506)	0.3391339620257018
  (0, 2448)	0.41215330696670416
  (0, 16548)	0.2914133609251895
  (0, 4132)	0.5012938309581979
  (0, 6042)	0.2355140813916517
  (1, 3062)	0.47654798689629824
  (1, 8151)	0.3678335109095725
  (1, 5653)	0.672527679392639
  (1, 6542)	0.278479343421971
  (1, 13779)	0.3282625476277222
  (2, 10055)	0.6227872810344516
  (2, 7898)	0.7823912081444391
  (3, 17366)	0.27414336079123575
  (3, 12925)	0.30121303976938374
  (3, 6363)	0.5098384353251276
  (3, 5455)	0.45579911304304593
  (3, 15187)	0.2191606644531618
  (3, 13994)	0.32598728495138685
  (3, 8223)	0.29522323834850395
  (3, 10554)	0.3535140641712389
  (4, 9881)	0.6685743798122914
  (4, 6757)	0.7436452774398622
  (5, 16244)	0.47586399283826103
  :	:
  (23340, 3014)	0.245180619435804
  (23340, 7099)	0.22714398385317702
  (23340, 16548)	0.23939802939328295
  (23341, 225)	0.4191160775105941
  (23341, 7171)	0.40963822396374144
  (23341, 3836)	0.46062

In [163]:
# Print the vectorized test features; the output lists (row, column) positions with their stored weights
print(X_test)

  (0, 15819)	0.4216152924489614
  (0, 13340)	0.6141841803949164
  (0, 8528)	0.3817007200566433
  (0, 7099)	0.3230056748580287
  (0, 3656)	0.27098043571911073
  (0, 3014)	0.34865432092694937
  (1, 15760)	0.33308179973035895
  (1, 13364)	0.3605047323733574
  (1, 12371)	0.17563393729028123
  (1, 11870)	0.39402631740697847
  (1, 6085)	0.15282795271473062
  (1, 5760)	0.39402631740697847
  (1, 4922)	0.2333832991536303
  (1, 2704)	0.34262287704498345
  (1, 1569)	0.4100677046629347
  (1, 458)	0.2331565768773938
  (2, 13384)	0.43850667403746013
  (2, 12403)	0.5622442625387076
  (2, 12204)	0.38641173018013353
  (2, 6882)	0.38894424862326826
  (2, 3062)	0.4370373351431049
  (3, 6542)	0.5093419220339135
  (3, 1280)	0.8605642372645976
  (4, 15819)	0.5959532911969049
  (4, 8748)	0.43989811574930365
  :	:
  (3530, 6463)	0.2184963125053067
  (3530, 440)	0.2359478846573929
  (3530, 402)	0.29787899521976496
  (3531, 16548)	0.18864610314425578
  (3531, 15641)	0.4499987307588632
  (3531, 13988)	0.44999873

### Training Model


In [136]:
# Fit a logistic-regression classifier on the vectorized training text.
# max_iter=900 sets the solver's iteration limit.
NLPmodel = LogisticRegression(max_iter=900)
NLPmodel.fit(X_train,Y_train)

## Model Evaluation


In [193]:
# Score the fitted model on the data it was trained on; all metrics are percentages.
train_prediction = NLPmodel.predict(X_train)

train_accuracy = accuracy_score(Y_train, train_prediction) * 100
train_precision = precision_score(Y_train, train_prediction, average='weighted') * 100
train_f1 = f1_score(Y_train, train_prediction, average='weighted') * 100

print(f"Training  Accuracy : {train_accuracy}")
print(f"Training Precision : {train_precision}")
print(f"Training F1 Score : {train_f1}")

Training Accuracy = 
Training  Accuracy : 82.50867497750932
Training Precision : 82.65964050126286
Training F1 Score : 82.55765571597043


In [194]:
# Score the fitted model on the held-out test set; all metrics are percentages.
test_prediction = NLPmodel.predict(X_test)

test_accuracy = accuracy_score(Y_test, test_prediction) * 100
test_precision = precision_score(Y_test, test_prediction, average='weighted') * 100
test_f1 = f1_score(Y_test, test_prediction, average='weighted') * 100

print(f"Testing  Accuracy : {test_accuracy}")
print(f"Testing Precision : {test_precision}")
print(f"Testing F1 Score : {test_f1}")


Testing  Accuracy : 71.1092246745897
Testing Precision : 71.2650000694683
Testing F1 Score : 71.09874204086508


In [82]:
import pickle
model = "NLPModel.sav"
# The context manager flushes and closes the file once the dump finishes;
# the bare open() used before left the handle open.
# NOTE(review): only the classifier is saved. Inference also needs the fitted
# `vectorizer`, which the deployment cell currently takes from kernel memory.
with open(model, 'wb') as model_file:
    pickle.dump(NLPmodel, model_file)

### Deployment and Testing of Model

In [187]:
#deployment of the model
# Label names indexed by the integer id used during training (0, 1, 2).
classes = ["neutral","positive","negative"]
txt = "Today is a bad day"
#load the pretrained model
# NOTE: unpickling can execute arbitrary code; only load model files you created yourself.
with open("NLPModel.sav", 'rb') as model_file:
    SentimentModel = pickle.load(model_file)
# Apply the same cleaning and vectorization used for the training text.
txt = stem_data(txt)
vectorized_text = vectorizer.transform([txt])
sentimentscore = SentimentModel.predict(vectorized_text)
# predict() returns one label id per input row; there is a single row here.
print(classes[int(sentimentscore[0])].capitalize())

Negative
