In [None]:
import pandas as pd
import numpy as np
from textblob import TextBlob
import joblib
from nltk.tokenize import word_tokenize
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score


# Fetch the NLTK resources used below: tokenizer models for word_tokenize, the
# stop-word lists, and the WordNet database for WordNetLemmatizer.
# 'punkt_tab' is the tokenizer data that recent NLTK releases look up instead of the
# pickled 'punkt' models; both are requested so either NLTK generation works.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

In [None]:
# Load the tweet dataset from the working directory and print a summary of its
# columns, dtypes, and non-null counts.
dataset = pd.read_csv('Twitter_Data.csv')
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 162980 entries, 0 to 162979
Data columns (total 2 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   clean_text  162976 non-null  object 
 1   category    162973 non-null  float64
dtypes: float64(1), object(1)
memory usage: 2.5+ MB


In [None]:
dataset.head()

Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0


In [None]:
dataset.shape

(162980, 2)

In [None]:
dataset.isna().sum()

clean_text    4
category      7
dtype: int64

In [None]:
# Drop every row that is missing its tweet text or its label.
# Note: this rebinds `dataset`, and the surviving rows keep their original index labels.
dataset = dataset.dropna()

In [None]:
dataset.isna().sum()

clean_text    0
category      0
dtype: int64

In [None]:
# Positional access: after dropna() the index has gaps, so the label 0 is not
# guaranteed to exist, whereas .iloc[0] always returns the first remaining row.
print(dataset['clean_text'].iloc[0])

when modi promised “minimum government maximum governance” expected him begin the difficult job reforming the state why does take years get justice state should and not business and should exit psus and temples


In [None]:
# Wrap the first remaining tweet in a TextBlob. .iloc[0] selects it by position,
# because the index labels have gaps after dropna().
blob = TextBlob(dataset['clean_text'].iloc[0])

In [None]:
# Print TextBlob's polarity score for each sentence it detects in the sample tweet.
for sentence in blob.sentences:
    print(sentence.sentiment.polarity)

-0.3


# Step 0: Counting entries of each category

In [None]:
count = dataset['category'].value_counts()
count

 1.0    72249
 0.0    55211
-1.0    35509
Name: category, dtype: int64

## Undersampling to balance the dataset

In [None]:
# Rows to keep per sentiment class. 30,000 is a fixed cap that sits below the
# smallest class in the counts above (35,509 negatives), not the smallest class size
# itself. The min() guard keeps .sample() from raising if a class ever holds fewer
# rows than the cap.
min_count = min(30000, int(dataset['category'].value_counts().min()))

# Undersample every class to the same number of rows so the labels are balanced
df_neutral = dataset[dataset['category'] == 0].sample(min_count, random_state=42)
df_positive = dataset[dataset['category'] == 1].sample(min_count, random_state=42)
df_negative = dataset[dataset['category'] == -1].sample(min_count, random_state=42)

# Combine the undersampled data
df_balanced = pd.concat([df_neutral, df_positive, df_negative])

# Shuffle the rows and renumber the index from 0 (this rebinds `dataset`)
dataset = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
dataset.shape

(90000, 2)

# Step 1: Lower Text

In [None]:
def to_lower(data):
    """Return a new list holding the lower-cased form of every string in `data`.

    `data` is any iterable of strings; the input is not modified.
    """
    return [str.lower(text) for text in data]

In [None]:
clean_text_1 = to_lower(dataset['clean_text'])
clean_text_1[:5]

['modi will due day but before that will make india bankrupt',
 'talks about the another tweet saying too close call bjp mgb bjp con concludes modi name prevails bjp wil otherwise ',
 'thanks for information  thanks this series will see the true stoties and india',
 'watch they used want send each other jail modi takes spbsp alliance harkens back their past ',
 'modi waiting for min 50k likes giis tweet its already past due time ']

# Step 2: Tokenize

In [None]:
def tokenize_sentence(data):
    """Split each string in `data` into tokens with NLTK's `word_tokenize`.

    Returns a list containing one token list per input string, in input order.
    """
    tokenized = []
    for sentence in data:
        tokenized.append(word_tokenize(sentence))
    return tokenized

In [None]:
clean_text_2 = tokenize_sentence(clean_text_1)

In [None]:
print(clean_text_2[:1])

[['modi', 'will', 'due', 'day', 'but', 'before', 'that', 'will', 'make', 'india', 'bankrupt']]


# Step 3: Punctuation removal

In [None]:
def remove_punctuation(text):
    """Strip punctuation from every token of every tokenized sentence.

    Each character that is neither a word character nor whitespace is deleted
    from the token. Tokens that end up empty are dropped.

    `text` is an iterable of token lists; a new list of token lists is returned.
    """
    non_word = re.compile(r'[^\w\s]')
    cleaned_sentences = []
    for tokens in text:
        stripped = (non_word.sub("", token) for token in tokens)
        cleaned_sentences.append([token for token in stripped if token != ""])
    return cleaned_sentences

In [None]:
clean_text_3 = remove_punctuation(clean_text_2)
print(clean_text_3[4:5])

[['modi', 'waiting', 'for', 'min', '50k', 'likes', 'giis', 'tweet', 'its', 'already', 'past', 'due', 'time']]


# Step 4: Stop word removal

In [None]:
def remove_stopwords(text, stop_words=None):
    """Drop stop words from every tokenized sentence.

    Parameters
    ----------
    text : iterable of token lists (tokens are expected to be strings).
    stop_words : optional iterable of words to remove. When omitted, NLTK's
        English stop-word list is used, as before.

    Returns
    -------
    A new list with one filtered token list per input sentence.
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    # Build the lookup once. Previously the stop-word list was re-read and
    # scanned linearly for every single token.
    stop_set = frozenset(stop_words)
    return [[word for word in words if word not in stop_set] for words in text]

In [None]:
clean_text_4 = remove_stopwords(clean_text_3)
print(len(clean_text_3[0]))
print(len(clean_text_4[0]))
print(clean_text_4[:1])

11
6
[['modi', 'due', 'day', 'make', 'india', 'bankrupt']]


# Stage 5: Stemming or Lemmatizing


In [None]:
def stemmer_sentence(text):
    """Replace every token with its Porter stem.

    `text` is an iterable of token lists. Returns a new list with one list of
    stemmed tokens per input sentence, using a single `PorterStemmer` instance.
    """
    stemmer = PorterStemmer()
    stemmed_sentences = []
    for tokens in text:
        stemmed_sentences.append([stemmer.stem(token) for token in tokens])
    return stemmed_sentences

In [None]:
clean_text_5 = stemmer_sentence(clean_text_4)
print(clean_text_5[:1])

[['modi', 'due', 'day', 'make', 'india', 'bankrupt']]


In [None]:
def sentence_lematizer(text):
    """Replace every token with its WordNet lemma.

    `text` is an iterable of token lists. Each token is passed to
    `WordNetLemmatizer.lemmatize` on its own, so the lemmatizer's default
    part of speech applies. Returns a new list of token lists.
    """
    lemmatizer = WordNetLemmatizer()
    return [[lemmatizer.lemmatize(token) for token in tokens] for tokens in text]

In [None]:
clean_text_6 = sentence_lematizer(clean_text_4)

In [None]:
print(clean_text_6[4:5])

[['modi', 'waiting', 'min', '50k', 'like', 'giis', 'tweet', 'already', 'past', 'due', 'time']]


# Stage 6: Vectorization

In [None]:
def vectorize(text, max_features=5000, vectorizer_path='vectorizer.pkl'):
    """Fit a TF-IDF vectorizer on tokenized sentences and return the feature matrix.

    Parameters
    ----------
    text : iterable of token lists; each list is joined with single spaces into
        one document before vectorizing.
    max_features : forwarded to `TfidfVectorizer(max_features=...)`.
    vectorizer_path : file the fitted vectorizer is written to with `joblib.dump`.

    Returns
    -------
    The matrix produced by `fit_transform` on the joined documents.

    Side effects
    ------------
    Writes (and overwrites) `vectorizer_path` on every call.
    """
    documents = [' '.join(words) for words in text]

    # Convert text data into TF-IDF features
    vectorizer = TfidfVectorizer(max_features=max_features)
    X = vectorizer.fit_transform(documents)

    # Persist the fitted vectorizer so the same vocabulary can be re-applied
    # to new text at prediction time.
    joblib.dump(vectorizer, vectorizer_path)

    # NOTE(review): this notebook calls vectorize() on the whole corpus before the
    # train/test split, so the fitted vocabulary and IDF weights have seen the test
    # rows. Fit on the training portion only for a leak-free evaluation.
    return X

In [None]:
X = vectorize(clean_text_6)
# Labels are used as-is: no conversion happens here, `category` is already numeric
y = dataset['category']

# Stage 7: Split Dataset  

In [None]:
def dataset_split(X, y, size):
    """Split features and labels into train and test parts.

    `size` is passed as `test_size` to `train_test_split`, and the shuffle is
    seeded with `random_state=42`.

    Returns the tuple `(X_train, X_test, y_train, y_test)`.
    """
    return tuple(train_test_split(X, y, test_size=size, random_state=42))

In [None]:
X_train, X_test, y_train, y_test = dataset_split(X, y, 0.2)

# Stage 8: Model Training and Prediction

In [None]:
def logistic_model(X_train, X_test, y_train, y_test, itera):
    """Fit a logistic-regression classifier and print its hold-out metrics.

    Parameters
    ----------
    X_train, y_train : training features and labels passed to `fit`.
    X_test, y_test : held-out features and labels used for the printed metrics.
    itera : forwarded as `max_iter` to `LogisticRegression`.

    Returns
    -------
    The fitted `LogisticRegression` instance.
    """
    # Tie each display name to its label value explicitly. With `target_names`
    # alone, names are matched positionally against the sorted labels that happen
    # to occur in y_test/y_pred, which breaks if a class is absent.
    sentiment_names = {-1.0: 'negative', 0.0: 'neutral', 1.0: 'positive'}

    model = LogisticRegression(max_iter=itera)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    # Print evaluation metrics
    print("Accuracy:", accuracy_score(y_test, y_pred)*100)
    print("Classification Report:\n", classification_report(
        y_test,
        y_pred,
        labels=list(sentiment_names),
        target_names=list(sentiment_names.values()),
    ))
    return model

In [None]:
lr_model = logistic_model(X_train, X_test, y_train, y_test, 3000)

Accuracy: 86.51666666666667
Classification Report:
               precision    recall  f1-score   support

    negative       0.87      0.83      0.85      6093
     neutral       0.83      0.95      0.89      5935
    positive       0.90      0.81      0.86      5972

    accuracy                           0.87     18000
   macro avg       0.87      0.87      0.86     18000
weighted avg       0.87      0.87      0.86     18000



In [None]:
def naive_bayes_model(X_train, X_test, y_train, y_test):
    """Fit a multinomial naive-Bayes classifier and print its hold-out metrics.

    Parameters
    ----------
    X_train, y_train : training features and labels passed to `fit`.
    X_test, y_test : held-out features and labels used for the printed metrics.

    Returns
    -------
    The fitted `MultinomialNB` instance.
    """
    # Tie each display name to its label value explicitly. With `target_names`
    # alone, names are matched positionally against the sorted labels that happen
    # to occur in y_test/y_pred, which breaks if a class is absent.
    sentiment_names = {-1.0: 'negative', 0.0: 'neutral', 1.0: 'positive'}

    model = MultinomialNB()
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    # Print evaluation metrics
    print("Accuracy:", accuracy_score(y_test, y_pred)*100)
    print("Classification Report:\n", classification_report(
        y_test,
        y_pred,
        labels=list(sentiment_names),
        target_names=list(sentiment_names.values()),
    ))
    return model

In [None]:
nb_model = naive_bayes_model(X_train, X_test, y_train, y_test)

Accuracy: 76.06111111111112
Classification Report:
               precision    recall  f1-score   support

    negative       0.71      0.80      0.75      6093
     neutral       0.80      0.75      0.77      5935
    positive       0.78      0.73      0.75      5972

    accuracy                           0.76     18000
   macro avg       0.76      0.76      0.76     18000
weighted avg       0.76      0.76      0.76     18000



In [None]:
def support_vector_model(X_train, X_test, y_train, y_test):
    """Fit a linear-kernel SVM classifier and print its hold-out metrics.

    Parameters
    ----------
    X_train, y_train : training features and labels passed to `fit`.
    X_test, y_test : held-out features and labels used for the printed metrics.

    Returns
    -------
    The fitted `SVC` instance.
    """
    # Tie each display name to its label value explicitly. With `target_names`
    # alone, names are matched positionally against the sorted labels that happen
    # to occur in y_test/y_pred, which breaks if a class is absent.
    sentiment_names = {-1.0: 'negative', 0.0: 'neutral', 1.0: 'positive'}

    # NOTE(review): fitting SVC on the full training split may be slow; LinearSVC is
    # the usual faster alternative for linear kernels but is a different solver, so
    # it is not swapped in here.
    model = SVC(kernel='linear')
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    # Print evaluation metrics
    print("Accuracy:", accuracy_score(y_test, y_pred)*100)
    print("Classification Report:\n", classification_report(
        y_test,
        y_pred,
        labels=list(sentiment_names),
        target_names=list(sentiment_names.values()),
    ))
    return model

In [None]:
svm_model = support_vector_model(X_train, X_test, y_train, y_test)

Accuracy: 87.82777777777778
Classification Report:
               precision    recall  f1-score   support

    negative       0.88      0.85      0.86      6093
     neutral       0.85      0.96      0.90      5935
    positive       0.91      0.83      0.86      5972

    accuracy                           0.88     18000
   macro avg       0.88      0.88      0.88     18000
weighted avg       0.88      0.88      0.88     18000



# Save model

In [None]:
# Save the trained model
# Only the SVM is written to disk; the logistic-regression and naive-Bayes dumps
# below are left disabled. predict() loads 'sentiment_analysis_svm_model.pkl'.
# joblib.dump(lr_model, 'sentiment_analysis_lr_model.pkl')
# joblib.dump(nb_model, 'sentiment_analysis_nb_model.pkl')
joblib.dump(svm_model, 'sentiment_analysis_svm_model.pkl')

['sentiment_analysis_svm_model.pkl']

In [None]:
def predict(text):
    """Predict the sentiment label of a single raw text string.

    The text goes through the same cleaning steps that produced the training
    corpus (lower-casing, tokenizing, punctuation removal, stop-word removal,
    lemmatizing) before it is vectorized. Previously the raw string was
    vectorized directly, so its tokens did not match the lemmatized,
    stop-word-free vocabulary the vectorizer was fitted on.

    Parameters
    ----------
    text : the raw input string.

    Returns
    -------
    The first element of the saved SVM model's `predict` output for `text`.
    """
    # NOTE: joblib.load unpickles arbitrary objects. Only load files this
    # notebook wrote itself.
    model = joblib.load('sentiment_analysis_svm_model.pkl')
    vectorizer = joblib.load('vectorizer.pkl')

    # Mirror the training pipeline on a one-element batch.
    tokens = tokenize_sentence(to_lower([text]))
    tokens = sentence_lematizer(remove_stopwords(remove_punctuation(tokens)))
    cleaned_text = ' '.join(tokens[0])

    X = vectorizer.transform([cleaned_text])
    y_pred = model.predict(X)[0]
    return y_pred

In [None]:
input_text = "Not me giggling everytime he says 'balls' Anyways, great vid man!!"

# Print the prediction
print("Predicted sentiment:", predict(input_text))

Predicted sentiment: 1.0
