## Load the dataset

In [2]:
import pandas as pd

df=pd.read_csv('twitter_training.csv',header=None)
# df.info()
print(df.head())


      0            1         2  \
0  2401  Borderlands  Positive   
1  2401  Borderlands  Positive   
2  2401  Borderlands  Positive   
3  2401  Borderlands  Positive   
4  2401  Borderlands  Positive   

                                                   3  
0  im getting on borderlands and i will murder yo...  
1  I am coming to the borders and I will kill you...  
2  im getting on borderlands and i will kill you ...  
3  im coming on borderlands and i will murder you...  
4  im getting on borderlands 2 and i will murder ...  


In [3]:
# renaming the columns headings

df.columns = ['id', 'topic', 'sentiment', 'text']
df.head()

Unnamed: 0,id,topic,sentiment,text
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [4]:
print('column names:',df.columns)
df['sentiment'].unique()

column names: Index(['id', 'topic', 'sentiment', 'text'], dtype='object')


array(['Positive', 'Neutral', 'Negative', 'Irrelevant'], dtype=object)

In [5]:
# removing irrelevant datas

df=df[df['sentiment'].isin(['Positive','Neutral','Negative'])]
df['sentiment'].unique()

array(['Positive', 'Neutral', 'Negative'], dtype=object)

In [6]:
df.shape

(61692, 4)

In [7]:
df['sentiment'].value_counts()

sentiment
Negative    22542
Positive    20832
Neutral     18318
Name: count, dtype: int64

## Preprocessing the text

In [8]:
df['text'].isna().sum()

np.int64(571)

In [9]:
import nltk
import string
from nltk.corpus import stopwords

nltk.download('stopwords')
stop_words=set(stopwords.words('english'))

# Text cleaning
def preprocess(text):
    text=str(text).lower()
    text=''.join([c for c in text if c not in string.punctuation])
    tokens=text.split()
    tokens=[word for word in tokens if word not in stop_words]
    return ' '.join(tokens)
df['cleaned_text']=df['text'].apply(preprocess)

  from scipy.stats import fisher_exact
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hibam/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## convert text to features (TF-IDF Vectorization)

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer=TfidfVectorizer(ngram_range=(1,2),max_features=7000)
X=vectorizer.fit_transform(df['cleaned_text'])
y=df['sentiment']

## Train_Test split


In [11]:
from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

## Train the classification model

In [12]:
from sklearn.linear_model import LogisticRegression

model=LogisticRegression(max_iter=1000)
model.fit(x_train,y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


## Model evaluation

In [13]:
from sklearn.metrics import classification_report

y_pred=model.predict(x_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

    Negative       0.80      0.80      0.80      4509
     Neutral       0.71      0.71      0.71      3650
    Positive       0.78      0.77      0.78      4180

    accuracy                           0.77     12339
   macro avg       0.76      0.76      0.76     12339
weighted avg       0.77      0.77      0.77     12339



## Predict sentiment for new text

In [14]:
def predict_sentiment(text):
    cleaned=preprocess(text)
    vec=vectorizer.transform([cleaned])
    return model.predict(vec)[0]

# Test predictions
print(predict_sentiment("This product is amazing!"))
print(predict_sentiment("It was okay, nothing special."))
print(predict_sentiment("Worst experience ever."))

Positive
Neutral
Negative
