In [1]:
import pandas as pd

# Load the tweet dataset from a CSV file (path is relative to the notebook's
# working directory) into a pandas DataFrame.
df = pd.read_csv('proper tweet data.csv')
# Bare last expression: shows the frame as the cell output.
df

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative
...,...,...,...,...
27476,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative
27477,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",negative
27478,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive
27479,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive


In [2]:
# Build the two-column modelling frame. Note that the 'text' column is taken
# from the source column 'selected_text', not from the original 'text' column.
data = {
    'text': df['selected_text'],
    'sentiment': df['sentiment'],
}

# Rebind `df` to the reduced frame, keeping only the first 25,000 rows.
# Because `df` is overwritten here, this cell needs the freshly loaded frame
# (which still has 'selected_text') as its input.
df = pd.DataFrame(data).head(25000)
df

Unnamed: 0,text,sentiment
0,"I`d have responded, if I were going",neutral
1,Sooo SAD,negative
2,bullying me,negative
3,leave me alone,negative
4,"Sons of ****,",negative
...,...,...
24995,i love,positive
24996,ou deserve so much bette,positive
24997,miss,negative
24998,its cuz you love me. dont make me sad,neutral


In [3]:
# Summarize the frame: index range, per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   text       24999 non-null  object
 1   sentiment  25000 non-null  object
dtypes: object(2)
memory usage: 390.8+ KB


In [4]:
# Drop every row that has a missing value in any column, rebinding `df`.
df = df.dropna()
# Re-print the summary so the non-null counts can be checked after the drop.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 24999 entries, 0 to 24999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   text       24999 non-null  object
 1   sentiment  24999 non-null  object
dtypes: object(2)
memory usage: 585.9+ KB


In [5]:
# List the distinct labels present in the 'sentiment' column (the target classes).
df['sentiment'].unique()

array(['neutral', 'negative', 'positive'], dtype=object)

In [6]:
# Keep only rows whose 'text' consists solely of ASCII letters and whitespace.
# Any text containing digits, punctuation or other symbols is dropped; because
# of the `*` quantifier, empty or whitespace-only strings also pass.
letters_only = df['text'].str.contains(r'^[a-zA-Z\s]*$')
df1 = df[letters_only]
print(df1.head())

             text sentiment
1        Sooo SAD  negative
2     bullying me  negative
3  leave me alone  negative
6             fun  positive
7      Soooo high   neutral


In [7]:
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split

# Features are the filtered texts; targets are their sentiment labels.
X, y = df1['text'], df1['sentiment']

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Tokenization, stemming, and stop word removal using NLTK

from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

import nltk
# Fetch the NLTK 'stopwords' corpus so stopwords.words('english') can be used later.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\appu6\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
# Tokenizer that extracts runs of word characters (\w+), so punctuation is not kept.
tokenizer = RegexpTokenizer(r"\w+")
# English stop words held in a set for fast membership tests during cleaning.
en_stopwords = set(stopwords.words('english'))
# Porter stemmer used to reduce each token to its stem.
ps = PorterStemmer()

In [10]:
def getCleanedText(text):
    """Normalize a raw string into a space-joined sequence of stemmed tokens.

    The text is lower-cased, split with the module-level ``tokenizer``,
    filtered against ``en_stopwords``, and every remaining token is stemmed
    with ``ps``.

    Args:
        text: Input string to clean.

    Returns:
        The stemmed tokens joined by single spaces; an empty string when no
        token survives the stop-word filter.
    """
    words = tokenizer.tokenize(text.lower())

    stems = []
    for word in words:
        # Stop words carry no signal for the classifier, so skip them.
        if word in en_stopwords:
            continue
        stems.append(ps.stem(word))

    return " ".join(stems)

In [11]:
# Run the cleaning function over every training and test text, in order,
# collecting the results into plain lists.
X_clean = list(map(getCleanedText, X_train))
xt_clean = list(map(getCleanedText, X_test))

In [12]:
# Preview the first 10 cleaned training texts to sanity-check the cleaning step.
X_clean[:10]

['go southland',
 'done dentist novemb',
 'unfortun',
 'reject',
 'love hairsss',
 'hurt',
 'tire',
 'delici',
 'sweeet',
 'one']

In [13]:
# Convert text data into a bag-of-words representation using CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Count features over unigrams and bigrams (ngram_range=(1, 2)).
cv = CountVectorizer(ngram_range = (1,2))
# Learn the vocabulary from the cleaned training texts and encode them.
# NOTE(review): .toarray() turns the sparse count matrix into a dense array,
# which can be very memory-hungry for a large vocabulary — consider keeping
# the sparse matrix if downstream estimators accept it.
X_vec = cv.fit_transform(X_clean).toarray()

X_vec

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [14]:
# Print the vocabulary terms (unigrams and bigrams) learned by the fitted CountVectorizer.
print(cv.get_feature_names_out())

['aaaaaaaaaaa' 'aaaaaaaaaaa mcfli' 'aaaaaaaaaamaz' ... 'zion tweet' 'zoo'
 'zoo today']


In [15]:
# Encode the cleaned test texts with the already-fitted vectorizer (transform
# only, no refit), so they share the training feature space.
# NOTE(review): .toarray() densifies the result; see memory caveat for large vocabularies.
Xt_vect = cv.transform(xt_clean).toarray()

In [16]:
# Show the encoded test matrix as the cell output.
Xt_vect

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [17]:
# Perform hyperparameter tuning using GridSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB

# Candidate values for MultinomialNB's `alpha` parameter.
param_grid = {'alpha': [0.1, 0.5, 1.0, 1.5, 2.0]}
# Grid search with 5-fold cross-validation (cv=5 here is the fold count
# keyword, unrelated to the `cv` vectorizer variable).
grid = GridSearchCV(MultinomialNB(), param_grid, cv=5)
grid.fit(X_vec, y_train)

In [18]:
# Alpha value selected by the grid search.
best_alpha = grid.best_params_['alpha']

# Build the Naive Bayes model with the best alpha
# and fit it on the full training feature matrix.
# NOTE(review): GridSearchCV refits the best estimator by default
# (grid.best_estimator_), so this second fit may be redundant — confirm.
mn = MultinomialNB(alpha=best_alpha)
mn.fit(X_vec, y_train)

In [19]:
# Predict a sentiment label for every row of the encoded test set.
y_pred = mn.predict(Xt_vect)

In [20]:
# Show the predicted labels as the cell output.
y_pred

array(['positive', 'positive', 'positive', ..., 'positive', 'positive',
       'positive'], dtype='<U8')

In [21]:
# Evaluating the model's accuracy
from sklearn.metrics import accuracy_score

# Compare predicted labels against the held-out true labels.
accuracy = accuracy_score(y_test, y_pred)
# Scale by 100 to print the score as a percentage.
print(accuracy*100)

83.08635666836996


In [22]:
# Displaying classification report with precision, recall, and F1-score
from sklearn.metrics import classification_report

# Print the per-class precision, recall, F1-score and support for the test predictions.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

    negative       0.87      0.81      0.84       730
     neutral       0.78      0.54      0.64       335
    positive       0.81      0.96      0.88       892

    accuracy                           0.83      1957
   macro avg       0.82      0.77      0.79      1957
weighted avg       0.83      0.83      0.82      1957



In [23]:
import numpy as np
import gradio as gr



def classify_sentiment(text):
    """Classify the sentiment of a piece of text with the trained model.

    The text is cleaned with ``getCleanedText``, encoded with the fitted
    ``cv`` vectorizer and scored by the ``mn`` classifier.

    Args:
        text: Raw input string. ``None`` is treated as an empty string.

    Returns:
        A ``(label, message)`` tuple. ``label`` is the predicted class and
        ``message`` is either ``"Confidence: NN.NN%"`` or a notice that the
        model cannot report probabilities.
    """
    # Guard against a missing value so `.lower()` in the cleaner cannot fail.
    cleaned_text = getCleanedText(text or "")
    new_vec = cv.transform([cleaned_text])

    if hasattr(mn, "predict_proba"):
        # predict_proba returns one row per sample. Select the single row
        # explicitly so argmax yields a class index rather than an index into
        # the flattened 2-D array.
        class_probs = mn.predict_proba(new_vec)[0]
        best_index = int(np.argmax(class_probs))
        best_prob = class_probs[best_index]
        return mn.classes_[best_index], f"Confidence: {best_prob * 100:.2f}%"

    # Fallback for estimators without probability support.
    prediction = mn.predict(new_vec)
    return prediction[0], "Model doesn't support probability estimation"

# Gradio Interface setup: a single text box feeds classify_sentiment, and the
# function's two return values populate the two Label outputs in order.
interface = gr.Interface(
    fn=classify_sentiment,
    title="Sentiment Analysis",
    description="Enter a tweet and get its sentiment classification and confidence.",
    inputs=gr.Textbox(label="Enter text for sentiment analysis"),
    outputs=[gr.Label(label="Predicted Sentiment"), gr.Label(label="Confidence")]
)

# share=True also requests a temporary public URL in addition to the local one.
# NOTE(review): anyone holding that link can reach the app — confirm this is intended.
interface.launch(share=True)


Running on local URL:  http://127.0.0.1:7860
Running on public URL: https://aa61e1eebc8982fa58.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)




In [24]:
import pickle
# Persist the trained classifier. The context manager ensures the file handle
# is flushed and closed even if pickling raises.
# NOTE(review): only `mn` is saved here; inputs for it are built with the
# fitted vectorizer `cv`, which would also be needed wherever this model is
# loaded — confirm it is persisted as well.
with open('inter.pkl', 'wb') as model_file:
    pickle.dump(mn, model_file)