In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [2]:


df = pd.read_csv('twitter_training.csv', header=None)  # No header  in file
df.columns = ['id', 'entity', 'sentiments', 'reviews']
df = df.reset_index(drop=True)


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74682 entries, 0 to 74681
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          74682 non-null  int64 
 1   entity      74682 non-null  object
 2   sentiments  74682 non-null  object
 3   reviews     73996 non-null  object
dtypes: int64(1), object(3)
memory usage: 2.3+ MB


In [4]:
df.isnull().sum()

id              0
entity          0
sentiments      0
reviews       686
dtype: int64

In [5]:
df = df.dropna(subset=['reviews'])


In [6]:
import nltk 
import re
from nltk.corpus import stopwords
from nltk.stem  import PorterStemmer
ps = PorterStemmer()

# Text Cleaning

In [8]:
corpus = []
filtered_labels = []

for review, sentiment in zip(df['reviews'], df['sentiments']):
    rp = re.sub(r"http\S+|www\S+|https\S+", '', review, flags=re.MULTILINE)  # Remove URLs
    rp = re.sub('[^a-zA-Z]', " ", rp)  # Keep only letters
    rp = rp.lower()
    rp = rp.split()
    rp = [ps.stem(word) for word in rp if word not in set(stopwords.words('english'))]
    rp = " ".join(rp)
    
    if len(rp.strip()) > 2:  # Only keep non-empty cleaned reviews
        corpus.append(rp)
        filtered_labels.append(sentiment)  


# Vectorization

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(corpus).toarray()



In [11]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y = le.fit_transform(filtered_labels)
y

array([3, 3, 3, ..., 3, 3, 3], dtype=int64)

# Modeling

In [13]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.callbacks import EarlyStopping

def build_ann():
    ann = Sequential()
    ann.add(Dense(input_dim=X.shape[1], units = 250 , kernel_initializer='uniform', activation='relu'))
    ann.add(Dropout(0.2))
    ann.add(Dense(units = 220 , kernel_initializer='uniform', activation='relu'))
    ann.add(Dropout(0.2))
   
    ann.add(Dense(units = 4 , kernel_initializer = 'uniform', activation='softmax'))
 
    ann.compile(optimizer='adam',loss ='sparse_categorical_crossentropy', metrics = ['accuracy'])
    return ann

In [14]:
model = build_ann()

stoping = EarlyStopping( monitor='loss',
    min_delta=0,
    patience=5,
    verbose=0,
    mode='auto',
    baseline=None,
    restore_best_weights=True,
    start_from_epoch=0,)

model.fit(X,y, epochs= 20, batch_size=32,validation_split=0.1,callbacks=[stoping])


Epoch 1/20
[1m2033/2033[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m67s[0m 29ms/step - accuracy: 0.5870 - loss: 0.9738 - val_accuracy: 0.4541 - val_loss: 1.5656
Epoch 2/20
[1m2033/2033[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 29ms/step - accuracy: 0.8972 - loss: 0.2990 - val_accuracy: 0.4480 - val_loss: 2.0072
Epoch 3/20
[1m2033/2033[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 29ms/step - accuracy: 0.9564 - loss: 0.1298 - val_accuracy: 0.4472 - val_loss: 2.4685
Epoch 4/20
[1m2033/2033[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 29ms/step - accuracy: 0.9679 - loss: 0.0893 - val_accuracy: 0.4707 - val_loss: 2.7766
Epoch 5/20
[1m2033/2033[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 29ms/step - accuracy: 0.9725 - loss: 0.0731 - val_accuracy: 0.4384 - val_loss: 3.2500
Epoch 6/20
[1m2033/2033[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 29ms/step - accuracy: 0.9738 - loss: 0.0660 - val_accuracy: 0.4637 - val_loss: 3.3389
Epoc

<keras.src.callbacks.history.History at 0x1e655cf3da0>

In [15]:
from sklearn.metrics import accuracy_score

y_pred = model.predict(X)
y_pred_classes = np.argmax(y_pred, axis=1) 
print("Accuracy:", accuracy_score(y, y_pred_classes))  # Use class labels



[1m2259/2259[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 8ms/step
Accuracy: 0.9272430397963137


# PREDICT ON TEST DATA

In [17]:
df_test = pd.read_csv('twitter_validation.csv')
df_test.columns = ['id', 'entity', 'sentiment', 'reviews']
df_test = df_test.reset_index(drop=True)
df_test

Unnamed: 0,id,entity,sentiment,reviews
0,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
1,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...
2,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,..."
3,4433,Google,Neutral,Now the President is slapping Americans in the...
4,6273,FIFA,Negative,Hi @EAHelp I’ve had Madeleine McCann in my cel...
...,...,...,...,...
994,4891,GrandTheftAuto(GTA),Irrelevant,⭐️ Toronto is the arts and culture capital of ...
995,4359,CS-GO,Irrelevant,tHIS IS ACTUALLY A GOOD MOVE TOT BRING MORE VI...
996,2652,Borderlands,Positive,Today sucked so it’s time to drink wine n play...
997,8069,Microsoft,Positive,Bought a fraction of Microsoft today. Small wins.


In [19]:
# Assuming you already have a preprocessing function like this:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df_test['cleaned_reviews'] = df_test['reviews'].apply(preprocess_text)


**re.sub() ka matlab hai replace using regex.**

**r'[^a-zA-Z\s]' ka matlab:**

**^ inside [] means "not".**

**a-zA-Z means only alphabets (small and capital).**

**\s means whitespace (space, tab, newline).**

**So this regex is saying:**

**Replace anything that is not an alphabet or space with an empty string (' ').**

In [21]:
df_test

Unnamed: 0,id,entity,sentiment,reviews,cleaned_reviews
0,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...,bbc news amazon boss jeff bezos rejects claims...
1,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...,microsoft why do i pay for word when it functi...
2,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,...",csgo matchmaking is so full of closet hacking ...
3,4433,Google,Neutral,Now the President is slapping Americans in the...,now the president is slapping americans in the...
4,6273,FIFA,Negative,Hi @EAHelp I’ve had Madeleine McCann in my cel...,hi eahelp ive had madeleine mccann in my cella...
...,...,...,...,...,...
994,4891,GrandTheftAuto(GTA),Irrelevant,⭐️ Toronto is the arts and culture capital of ...,toronto is the arts and culture capital of can...
995,4359,CS-GO,Irrelevant,tHIS IS ACTUALLY A GOOD MOVE TOT BRING MORE VI...,this is actually a good move tot bring more vi...
996,2652,Borderlands,Positive,Today sucked so it’s time to drink wine n play...,today sucked so its time to drink wine n play ...
997,8069,Microsoft,Positive,Bought a fraction of Microsoft today. Small wins.,bought a fraction of microsoft today small wins


# Featuring Scaling X & Y

In [23]:
X_test = vectorizer.transform(df_test['cleaned_reviews']).toarray()
y_test = le.transform(df_test['sentiment'])


# Predicting Text Valeus

In [25]:
y_pred_test = model.predict(X_test)
y_pred_classes = np.argmax(y_pred_test, axis=1)
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

print("Test Accuracy:", accuracy_score(y_test, y_pred_classes))
print("\nClassification Report:\n", classification_report(y_test, y_pred_classes, target_names=le.classes_))


[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step
Test Accuracy: 0.6496496496496497

Classification Report:
               precision    recall  f1-score   support

  Irrelevant       0.49      0.72      0.58       171
    Negative       0.71      0.67      0.69       266
     Neutral       0.76      0.59      0.66       285
    Positive       0.65      0.66      0.65       277

    accuracy                           0.65       999
   macro avg       0.65      0.66      0.65       999
weighted avg       0.67      0.65      0.65       999

