# Import Libraries

In [40]:
import pandas as pd
import numpy as np
import matplotlib as plt
import nltk

In [41]:
import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this is a blanket filter, so deprecation and solver warnings
# raised by the libraries used below are hidden as well.
warnings.filterwarnings("ignore")

# Data Preprocessing

In [42]:
# Location of the raw tweet dataset on the author's machine.
SENTIMENT_CSV = r"D:\ML\DataSets\Sentiment.csv"

# The file is decoded as latin-1; the first five rows are shown as a sanity check.
df = pd.read_csv(SENTIMENT_CSV, encoding='latin-1')
df.head()

Unnamed: 0,textID,text,selected_text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²)
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,morning,0-20,Afghanistan,38928346,652860.0,60
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,noon,21-30,Albania,2877797,27400.0,105
2,088c60f138,my boss is bullying me...,bullying me,negative,night,31-45,Algeria,43851044,2381740.0,18
3,9642c003ef,what interview! leave me alone,leave me alone,negative,morning,46-60,Andorra,77265,470.0,164
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,noon,60-70,Angola,32866272,1246700.0,26


In [43]:
# Identifier, span-annotation and demographic columns are discarded here;
# the frame shown in the next cell keeps only `text` and `sentiment`.
columns_to_drop = [
    "textID",
    "selected_text",
    "Time of Tweet",
    "Age of User",
    "Country",
    "Population -2020",
    "Land Area (Km²)",
    "Density (P/Km²)",
]
df = df.drop(columns=columns_to_drop)

In [44]:
# Display the frame after the column drop (the notebook elides the middle rows).
df

Unnamed: 0,text,sentiment
0,"I`d have responded, if I were going",neutral
1,Sooo SAD I will miss you here in San Diego!!!,negative
2,my boss is bullying me...,negative
3,what interview! leave me alone,negative
4,"Sons of ****, why couldn`t they put them on t...",negative
...,...,...
27476,wish we could come see u on Denver husband l...,negative
27477,I`ve wondered about rake to. The client has ...,negative
27478,Yay good for both of you. Enjoy the break - y...,positive
27479,But it was worth it ****.,positive


In [45]:
# Class balance: number of rows per distinct value of the `sentiment` column.
unique_counts = df['sentiment'].value_counts()
print(unique_counts)


sentiment
neutral     11118
positive     8582
negative     7781
Name: count, dtype: int64


In [46]:
# Numeric encoding of the three sentiment classes.
label = {'neutral': 0, 'negative': -1, 'positive': 1}

# Series.map() turns every value that is not a key of `label` into NaN, so
# running this cell a second time would silently wipe the already-encoded
# column. Encode only while the column is still non-numeric, which makes the
# cell safe to re-run.
if not pd.api.types.is_numeric_dtype(df["sentiment"]):
    df["sentiment"] = df["sentiment"].map(label)

In [47]:
# Display the frame again to confirm `sentiment` now holds -1 / 0 / 1.
df

Unnamed: 0,text,sentiment
0,"I`d have responded, if I were going",0
1,Sooo SAD I will miss you here in San Diego!!!,-1
2,my boss is bullying me...,-1
3,what interview! leave me alone,-1
4,"Sons of ****, why couldn`t they put them on t...",-1
...,...,...
27476,wish we could come see u on Denver husband l...,-1
27477,I`ve wondered about rake to. The client has ...,-1
27478,Yay good for both of you. Enjoy the break - y...,1
27479,But it was worth it ****.,1


In [48]:
# Count missing values per column.
df.isnull().sum()

text         1
sentiment    0
dtype: int64

In [49]:
# Remove rows containing any missing value (the null count above reports one
# missing `text`). The original index labels are kept, so the index is no
# longer contiguous after this step.
df=df.dropna()

In [50]:
# Number of rows that exactly duplicate an earlier row (text and sentiment both equal).
df.duplicated().sum()

np.int64(0)

In [51]:
# (rows, columns) of the cleaned frame.
df.shape 

(27480, 2)

# Lemmatizing and Vectorizing

In [52]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [53]:
# Read the corpus through a private alias. The assignment below rebinds the
# name `stopwords` to a plain set (preprocess_text relies on that name), so
# calling `stopwords.words(...)` on a second run of this cell would fail with
# AttributeError. With the alias the cell can be re-run safely.
# NOTE(review): presumably the NLTK 'stopwords' and 'wordnet' data packages
# must already be downloaded on this machine — confirm for fresh environments.
from nltk.corpus import stopwords as _stopwords_corpus

# English stop words as a set, for fast membership tests during token filtering.
stopwords = set(_stopwords_corpus.words('english'))
lemmatizer = WordNetLemmatizer()

In [54]:
def preprocess_text(text):
    """Normalize one piece of text into a space-joined string of lemmas.

    The text is lower-cased, every character that is neither a letter nor
    whitespace is replaced by a space, the result is split on whitespace,
    tokens found in the module-level ``stopwords`` collection are dropped,
    and the remaining tokens are passed through the module-level
    ``lemmatizer``.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned tokens joined by single spaces ("" when nothing remains).
    """
    lowered = text.lower()
    # Punctuation, digits and symbols become spaces so they act as separators.
    letters_only = ''.join(
        ch if (ch.isalpha() or ch.isspace()) else ' ' for ch in lowered
    )
    kept = filter(lambda tok: tok not in stopwords, letters_only.split())
    return ' '.join(map(lemmatizer.lemmatize, kept))

# Overwrite the raw text with its cleaned form, applying preprocess_text to each row.
df['text'] = df['text'].apply(preprocess_text)

In [55]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer with default settings; it is fitted on the cleaned text in the next cell.
vectorizer=TfidfVectorizer()

In [56]:
# Learn the TF-IDF vocabulary from the cleaned text and encode every row.
# The result is kept as the sparse matrix returned by fit_transform(): a dense
# copy would store one float per (row, vocabulary term) pair, almost all of
# them zero, and train_test_split and the scikit-learn estimators used below
# accept sparse input directly.
x = vectorizer.fit_transform(df['text'])

# Target: the numerically encoded sentiment label.
y = df['sentiment']

# Model Training

In [57]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for evaluation.
# random_state pins the shuffle so the split — and therefore the reported
# metrics — are reproducible across kernel restarts; stratify=y keeps the
# class proportions the same in the train and test partitions.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=42, stratify=y
)

In [58]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [59]:
# Logistic regression with default hyperparameters; this is the model fitted and scored below.
lr=LogisticRegression()
# NOTE(review): knn is instantiated here but is not fitted or evaluated in any cell shown.
knn=KNeighborsClassifier()


In [60]:
# Train on the training partition, then predict the held-out rows.
lr.fit(x_train, y_train)
y_pred = lr.predict(x_test)

# Accuracy takes no averaging argument; the other three metrics are
# support-weighted averages over the sentiment classes.
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    scorer(y_test, y_pred, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)

summary = f"Accuracy: {accuracy:.2f}, Precision: {precision:.2f}, Recall: {recall:.2f}, F1: {f1:.2f}"
print(summary)
  

Accuracy: 0.68, Precision: 0.70, Recall: 0.68, F1: 0.68


# Pickling

In [61]:
import pickle

In [62]:
#pickle.dump(lr,open("D:\ML\Projects\Sentiment_Analysis_NLP\lr_model.pkl","wb"))

In [63]:
#pickle.dump(vectorizer,open("D:\ML\Projects\Sentiment_Analysis_NLP\\vectorizer.pkl","wb"))

In [64]:
# Load the previously saved model from disk.
# - The path is a raw string: "\M", "\P", "\S" and "\l" are not valid escape
#   sequences, and relying on Python passing them through is deprecated.
# - The `with` block closes the file handle as soon as loading finishes.
# NOTE(review): the cell that writes this file is commented out above, so the
# file must already exist on disk for a fresh run to succeed.
# pickle.load() can execute arbitrary code — only load files you created yourself.
with open(r"D:\ML\Projects\Sentiment_Analysis_NLP\lr_model.pkl", "rb") as model_file:
    pickled_model = pickle.load(model_file)

In [65]:
from sklearn.feature_extraction.text import CountVectorizer  # no longer used in this cell; import kept in case other cells rely on the name
import pickle

# Persist the vectorizer that actually produced the training features: the
# TF-IDF `vectorizer` fitted earlier in the notebook. Fitting a fresh
# CountVectorizer here and saving that instead would feed raw term counts to a
# model trained on TF-IDF weights at prediction time.
# NOTE(review): run the notebook top to bottom — if the loading cell below was
# executed first, `vectorizer` is whatever object was read from disk.
with open("D:\\ML\\Projects\\Sentiment_Analysis_NLP\\vectorizer.pkl", "wb") as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)


In [66]:
import pickle

# Reload the saved model and vectorizer; `with` closes each file after reading.
# pickle.load() can execute arbitrary code — only load files you created yourself.
with open("D:\\ML\\Projects\\Sentiment_Analysis_NLP\\lr_model.pkl", "rb") as model_file:
    pickled_model = pickle.load(model_file)
with open("D:\\ML\\Projects\\Sentiment_Analysis_NLP\\vectorizer.pkl", "rb") as vectorizer_file:
    vectorizer = pickle.load(vectorizer_file)

new_input = ["I`m in VA for the weekend, my youngest son turns 2 tomorrow......it makes me kinda sad, he is getting so big, check out my twipics"]

# Clean the raw text with the same preprocess_text() that was applied to the
# training data; otherwise stop words and un-lemmatized word forms reach the
# vectorizer although the vocabulary was built from cleaned text.
new_input_cleaned = [preprocess_text(tweet) for tweet in new_input]
new_input_vectorized = vectorizer.transform(new_input_cleaned)

prediction = pickled_model.predict(new_input_vectorized)
print("Prediction:", prediction)


Prediction: [-1]


In [67]:

# Inverse of the encoding used for training (neutral=0, negative=-1, positive=1).
sentiment_names = {-1: "negative", 0: "neutral", 1: "positive"}

# predict() returns an array even for a single sample. Take the first element
# explicitly instead of testing `prediction == -1` in an `if`, which depends on
# the truth value of an array and raises ValueError once the array holds more
# than one prediction. Any class outside the table maps to "unknown".
result = sentiment_names.get(int(prediction[0]), "unknown")

print(result)


negative
