In [4]:
!pip install nltk

Collecting nltk
  Using cached nltk-3.8.1-py3-none-any.whl.metadata (2.8 kB)
Collecting click (from nltk)
  Using cached click-8.1.7-py3-none-any.whl.metadata (3.0 kB)
Collecting joblib (from nltk)
  Using cached joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting regex>=2021.8.3 (from nltk)
  Downloading regex-2024.5.15-cp312-cp312-win_amd64.whl.metadata (41 kB)
     ---------------------------------------- 0.0/42.0 kB ? eta -:--:--
     ------------------ ------------------- 20.5/42.0 kB 682.7 kB/s eta 0:00:01
     --------------------------- ---------- 30.7/42.0 kB 445.2 kB/s eta 0:00:01
     -------------------------------------- 42.0/42.0 kB 406.8 kB/s eta 0:00:00
Using cached nltk-3.8.1-py3-none-any.whl (1.5 MB)
Downloading regex-2024.5.15-cp312-cp312-win_amd64.whl (268 kB)
   ---------------------------------------- 0.0/268.5 kB ? eta -:--:--
   --------- ------------------------------ 61.4/268.5 kB 1.7 MB/s eta 0:00:01
   ------------- -------------------------- 92.2/268

In [5]:
# create a naive bayes to classify the sentiment of the tweets

# import the necessary libraries
import nltk
from nltk.corpus import twitter_samples
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.classify import NaiveBayesClassifier



In [6]:
nltk.download('twitter_samples')

[nltk_data] Downloading package twitter_samples to C:\Users\Mosub
[nltk_data]     Gamal\AppData\Roaming\nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!


True

In [7]:
# get the positive and negative tweets from the twitter samples corpus
positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')

In [8]:
#remove stop words
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\Mosub
[nltk_data]     Gamal\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
len(negative_tweets)

5000

In [10]:
# create a function to clean the tweets
import re
def clean_tweet(tweet):
    # Convert to lowercase
    tweet = tweet.lower()
    
    # Remove URLs
    tweet = re.sub(r'http\S+|www\S+|https\S+', '', tweet, flags=re.MULTILINE)
    # Tokenize
    tokens = word_tokenize(tweet)
    
    # Stemming
    stemmer = PorterStemmer()
    cleaned_tweet = [stemmer.stem(word) for word in tokens if word.isalpha()]
    
    # Remove stop words
    stopwords = set(nltk.corpus.stopwords.words('english'))
    cleaned_tweet = [word for word in cleaned_tweet if word not in stopwords]
    return cleaned_tweet

In [11]:
# show stop words
from nltk.corpus import stopwords
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [12]:
for i in range(5):
    print("Dirty Tweet :", negative_tweets[i])
    print("clean tweet :", clean_tweet(negative_tweets[i]))
    print()


Dirty Tweet : hopeless for tmr :(
clean tweet : ['hopeless', 'tmr']

Dirty Tweet : Everything in the kids section of IKEA is so cute. Shame I'm nearly 19 in 2 months :(
clean tweet : ['everyth', 'kid', 'section', 'ikea', 'cute', 'shame', 'nearli', 'month']

Dirty Tweet : @Hegelbon That heart sliding into the waste basket. :(
clean tweet : ['hegelbon', 'heart', 'slide', 'wast', 'basket']

Dirty Tweet : “@ketchBurning: I hate Japanese call him "bani" :( :(”

Me too
clean tweet : ['ketchburn', 'hate', 'japanes', 'call', 'bani']

Dirty Tweet : Dang starting next week I have "work" :(
clean tweet : ['dang', 'start', 'next', 'week', 'work']



In [13]:
# compute frequency distribution of words for positive and negative tweets
pos_freq = nltk.FreqDist([word for tweet in positive_tweets for word in clean_tweet(tweet)])
pos_freq

FreqDist({'thank': 642, 'follow': 446, 'love': 399, 'thi': 304, 'u': 247, 'day': 242, 'good': 238, 'like': 232, 'happi': 211, 'get': 209, ...})

In [14]:
# compute frequency distribution of words for positive and negative tweets
neg_freq = nltk.FreqDist([word for tweet in negative_tweets for word in clean_tweet(tweet)])
neg_freq

FreqDist({'thi': 319, 'miss': 301, 'pleas': 275, 'wa': 263, 'follow': 263, 'want': 246, 'get': 233, 'go': 223, 'like': 223, 'u': 189, ...})

In [15]:
# convert neg/pos freq dist to one pandas dataframe
import pandas as pd
pos_df = pd.DataFrame({'Word': list(pos_freq.keys()), 'Frequency': list(pos_freq.values())})
pos_df['Type'] = 'Positive'
neg_df = pd.DataFrame({'Word': list(neg_freq.keys()), 'Frequency': list(neg_freq.values())})
neg_df['Type'] = 'Negative'
# combine pos and neg dataframes
tweets_df = pd.concat([pos_df, neg_df])
tweets_df.head()

Unnamed: 0,Word,Frequency,Type
0,followfriday,25,Positive
1,top,31,Positive
2,engag,7,Positive
3,member,16,Positive
4,commun,33,Positive


In [25]:
# pivot table to show frequency of words across positive/negative tweets
tweets_pivot = tweets_df.pivot(index='Word', columns='Type', values='Frequency').fillna(0)
tweets_pivot.sort_values('Positive', ascending=False).head()

# convert it to 3 by m frame
tweets_pivot = tweets_pivot.reset_index()

# # Remove Type column
# tweets_pivot = tweets_pivot.drop('Type', axis=1)

# Get only the last three columns
tweets_pivot = tweets_pivot[['Word','Positive','Negative']]
tweets_pivot.head()

Type,Word,Positive,Negative
0,aa,2.0,0.0
1,aaaaaaaaaaa,0.0,1.0
2,aaaaaaaaaaaaa,0.0,1.0
3,aaaaaaaaaaaah,0.0,1.0
4,aaaaaand,1.0,0.0


In [2]:
# fix seaborn library
! pip install seaborn

Collecting seaborn
  Using cached seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Collecting matplotlib!=3.6.1,>=3.4 (from seaborn)
  Downloading matplotlib-3.9.1-cp312-cp312-win_amd64.whl.metadata (11 kB)
Collecting contourpy>=1.0.1 (from matplotlib!=3.6.1,>=3.4->seaborn)
  Downloading contourpy-1.2.1-cp312-cp312-win_amd64.whl.metadata (5.8 kB)
Collecting cycler>=0.10 (from matplotlib!=3.6.1,>=3.4->seaborn)
  Using cached cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)
Collecting fonttools>=4.22.0 (from matplotlib!=3.6.1,>=3.4->seaborn)
  Downloading fonttools-4.53.1-cp312-cp312-win_amd64.whl.metadata (165 kB)
     ---------------------------------------- 0.0/165.9 kB ? eta -:--:--
     -- ------------------------------------- 10.2/165.9 kB ? eta -:--:--
     --------- --------------------------- 41.0/165.9 kB 388.9 kB/s eta 0:00:01
     -------------------- ---------------- 92.2/165.9 kB 744.7 kB/s eta 0:00:01
     ------------------------------------ 165.9/165.9 kB 905.1 kB/s eta

In [1]:
import seaborn as sns
import matplotlib.pyplot as plt
# plot word frequency distribution for positive and negative tweets
sns.barplot(x='Word', y='Frequency', hue='Type', data=tweets_df)


ModuleNotFoundError: No module named 'seaborn'