In [1]:
import pandas as pd


In [2]:


df = pd.read_csv('twitter_training.csv', header=None)  # No header  in file
df.columns = ['id', 'entity', 'sentiments', 'reviews']
df = df.reset_index(drop=True)


In [3]:
df

Unnamed: 0,id,entity,sentiments,reviews
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
...,...,...,...,...
74677,9200,Nvidia,Positive,Just realized that the Windows partition of my...
74678,9200,Nvidia,Positive,Just realized that my Mac window partition is ...
74679,9200,Nvidia,Positive,Just realized the windows partition of my Mac ...
74680,9200,Nvidia,Positive,Just realized between the windows partition of...


In [4]:
df['sentiments'].unique()

array(['Positive', 'Neutral', 'Negative', 'Irrelevant'], dtype=object)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74682 entries, 0 to 74681
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          74682 non-null  int64 
 1   entity      74682 non-null  object
 2   sentiments  74682 non-null  object
 3   reviews     73996 non-null  object
dtypes: int64(1), object(3)
memory usage: 2.3+ MB


In [6]:
df.isnull().sum()

id              0
entity          0
sentiments      0
reviews       686
dtype: int64

In [7]:
df = df.dropna(subset=['reviews'])


In [8]:
import nltk 
import re
from nltk.corpus import stopwords
from nltk.stem  import PorterStemmer
ps = PorterStemmer()

# Text Cleaning

In [10]:
# corpus =[]
# for i in df['reviews']:
#     rp = re.sub(r"http\S+|www\S+|https\S+", '', i, flags=re.MULTILINE)  # remove URLs
#     rp = re.sub('[^a-zA-Z]', " ", rp)
#     rp = rp.lower()
#     rp = rp.split()
#     rp =[ps.stem(word) for word in rp if not word in set(stopwords.words('english'))]
#     rp = " ".join(rp)
#     if len(rp.strip()) > 2:
#         corpus.append(rp)

In [11]:
corpus = []
filtered_labels = []

for review, sentiment in zip(df['reviews'], df['sentiments']):
    rp = re.sub(r"http\S+|www\S+|https\S+", '', review, flags=re.MULTILINE)  # Remove URLs
    rp = re.sub('[^a-zA-Z]', " ", rp)  # Keep only letters
    rp = rp.lower()
    rp = rp.split()
    rp = [ps.stem(word) for word in rp if word not in set(stopwords.words('english'))]
    rp = " ".join(rp)
    
    if len(rp.strip()) > 2:  # Only keep non-empty cleaned reviews
        corpus.append(rp)
        filtered_labels.append(sentiment)  


In [12]:
corpus

['im get borderland murder',
 'come border kill',
 'im get borderland kill',
 'im come borderland murder',
 'im get borderland murder',
 'im get borderland murder',
 'spent hour make someth fun know huge borderland fan maya one favorit charact decid make wallpap pc origin imag versu creation made enjoy pic twitter com mlsi wf jg',
 'spent coupl hour someth fun know huge borderland fan maya one favorit charact decid make wallpap pc origin pictur compar creation made fun pic twitter com mlsi wf jg',
 'spent hour someth fun know huge borderland fan maya one favorit charact',
 'spent hour make someth fun know huge rhandlerr fan maya one favorit charact decid make wallpap pc origin imag versu creation made enjoy pic twitter com mlsi wf jg',
 'spent hour make someth fun know huge rhandlerr fan maya one favorit charact decid make wallpap pc origin imag versu creation made enjoy pic twitter com mlsi wf jg',
 'rock hard la varlop rare power handsom jackpot borderland xbox dlvr rmtrgf',
 'rock h

# Vectorization

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=6000,ngram_range=(1,3))
X = vectorizer.fit_transform(corpus).toarray()

X

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [15]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y = le.fit_transform(filtered_labels)
y

array([3, 3, 3, ..., 3, 3, 3], dtype=int64)

# Modeling


In [17]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X,y)


In [18]:
from sklearn.metrics import accuracy_score

y_pred = model.predict(X)
print("Accuracy:", accuracy_score(y, y_pred))


Accuracy: 0.6806331986494714
