In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
import re
from sklearn.model_selection import train_test_split

In [2]:
data = pd.read_csv('twitter_training.csv')

In [3]:
data.head()

Unnamed: 0,Tweet ID,entity,sentiment,Tweet
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [4]:
tweets = data[['sentiment', 'Tweet']]

In [5]:
tweets

Unnamed: 0,sentiment,Tweet
0,Positive,im getting on borderlands and i will murder yo...
1,Positive,I am coming to the borders and I will kill you...
2,Positive,im getting on borderlands and i will kill you ...
3,Positive,im coming on borderlands and i will murder you...
4,Positive,im getting on borderlands 2 and i will murder ...
...,...,...
74677,Positive,Just realized that the Windows partition of my...
74678,Positive,Just realized that my Mac window partition is ...
74679,Positive,Just realized the windows partition of my Mac ...
74680,Positive,Just realized between the windows partition of...


In [21]:
tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74682 entries, 0 to 74681
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   sentiment  74682 non-null  object
 1   Tweet      73996 non-null  object
dtypes: object(2)
memory usage: 1.1+ MB


In [6]:
from collections import Counter
value_counts = Counter(tweets['sentiment'])

# Iterate through the results to print each value and its count
for value, count in value_counts.items():
    print(f"'{value}': {count} times")

'Positive': 20832 times
'Neutral': 18318 times
'Negative': 22542 times
'Irrelevant': 12990 times


# Data Prepocessing

In [7]:
train,test = train_test_split(tweets,test_size=0.2)

In [8]:
for val in train['Tweet']:
    print(val)

CHILLS
Microsoft 365 had seen a big outage today
The @EAMaddenNFL @EA I am glad too we are getting some patches needed for the game especially franchise. We need more. Even bringing back previously available functions ( i. e. stadium creator ) from Madden World games of from the past. I hope you team guys make enough programming improvements necessary to make make this game feel more worth it
Another perfectly splendid day to be a Gears fan 🥳🥳🥳 YAY TEAM!. . “Between Tactics and last year’s excellent Gears 5, it feels like the franchise is firing on all cylinders again.”
You can catch up with all of "The Xbox Has No Games Podcast" right here:.. intromediagaming.com / home / the-xbox-..... One of the best Xbox Podcast out here...
See the perfect relationship here
Camdy's mix actually which inspired me to not only be a better combo but also make it around a theme which I usually don't do and I thank them for the idea!
@FortniteGame WHY CANT I ENABLE 2FA! My website just says the article y

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [9]:
pattern = "(@\w+)|(@\s\w+)|(http.*)|(#\w+)"

In [10]:
ps = PorterStemmer()
lemmatizer = WordNetLemmatizer()
#nltk.download('wordnet')
#nltk.download('stopwords')

In [24]:
def cleaning(data):
    cleaned_tweets = []
    sentiment = []

    for index, row in data.iterrows():
        if not pd.isna(row['Tweet']) and isinstance(row['Tweet'], str):
            sentence = re.sub(pattern, '', row['Tweet'])
            words = [e.lower() for e in sentence.split()]
            words = [lemmatizer.lemmatize(word) for word in words if word not in stopwords.words('english')]
            cleaned_tweets.append(' '.join(words))
            sentiment.append(row['sentiment'])

    return cleaned_tweets, sentiment

In [25]:
train_tweets, train_sentiment = cleaning(train)

In [30]:
#Cleaning for Test
test_tweets, test_sentiment = cleaning(test)

In [29]:
train_tweets

['chill',
 'microsoft 365 seen big outage today',
 'glad getting patch needed game especially franchise. need more. even bringing back previously available function ( i. e. stadium creator ) madden world game past. hope team guy make enough programming improvement necessary make make game feel worth',
 'another perfectly splendid day gear fan 🥳🥳🥳 yay team!. . “between tactic last year’s excellent gear 5, feel like franchise firing cylinder again.”',
 'catch "the xbox game podcast" right here:.. intromediagaming.com / home / the-xbox-..... one best xbox podcast here...',
 'see perfect relationship',
 "camdy's mix actually inspired better combo also make around theme usually thank idea!",
 'cant enable 2fa! website say article using invalid. complete fuck',
 '10 greatest innovation league legend history: youtu.be / ehbhmtrktlw',
 'ban fab 4 player fakeanchorman landed | details: bf4db.com/player/ban/184…',
 'best thing disney + verizon free year series.. 1995 spider-man... still favorite

In [37]:
final_data = {'tweets': train_tweets, 'sentiment': train_sentiment}

In [38]:
processed_data = pd.DataFrame(final_data)

In [39]:
processed_data

Unnamed: 0,tweets,sentiment
0,chill,Irrelevant
1,microsoft 365 seen big outage today,Negative
2,glad getting patch needed game especially fran...,Neutral
3,another perfectly splendid day gear fan 🥳🥳🥳 ya...,Irrelevant
4,"catch ""the xbox game podcast"" right here:.. in...",Irrelevant
...,...,...
59193,hate league legend :d pic.twitter.com/twkakwy8ab,Negative
59194,fortnite pro dub banned twitch using racist in...,Neutral
59195,wale nba 2k sound track criminal,Negative
59196,say wow.,Positive


In [40]:
from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()
processed_data['sentiment'] = labelencoder.fit_transform(processed_data['sentiment'])

In [42]:
processed_data

Unnamed: 0,tweets,sentiment
0,chill,0
1,microsoft 365 seen big outage today,1
2,glad getting patch needed game especially fran...,2
3,another perfectly splendid day gear fan 🥳🥳🥳 ya...,0
4,"catch ""the xbox game podcast"" right here:.. in...",0
...,...,...
59193,hate league legend :d pic.twitter.com/twkakwy8ab,1
59194,fortnite pro dub banned twitch using racist in...,2
59195,wale nba 2k sound track criminal,1
59196,say wow.,3


In [44]:
from collections import Counter
value_counts = Counter(processed_data['sentiment'])

# Iterate through the results to print each value and its count
for value, count in value_counts.items():
    print(f"'{value}': {count} times")

print("0 = Irrelevant, 1 = Negative, 2 = Neutal, 3 = Positive")

'0': 10298 times
'1': 17941 times
'2': 14437 times
'3': 16522 times
0 = Irrelevant, 1 = Negative, 2 = Neutal, 3=Positive
