In [1]:
import nltk

nltk.download('twitter_samples')

[nltk_data] Downloading package twitter_samples to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!


True

In [2]:
from nltk.corpus import twitter_samples

In [3]:
positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')

positive_tweets[50]

'@groovinshawn they are rechargeable and it normally comes with a charger when u buy it :)'

In [4]:
tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
print(tweet_tokens[50])

['@groovinshawn', 'they', 'are', 'rechargeable', 'and', 'it', 'normally', 'comes', 'with', 'a', 'charger', 'when', 'u', 'buy', 'it', ':)']


In [5]:
nltk.download('averaged_perceptron_tagger_eng')
from nltk.tag import pos_tag

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [6]:
pos_tag(tweet_tokens[50])

[('@groovinshawn', 'NN'),
 ('they', 'PRP'),
 ('are', 'VBP'),
 ('rechargeable', 'JJ'),
 ('and', 'CC'),
 ('it', 'PRP'),
 ('normally', 'RB'),
 ('comes', 'VBZ'),
 ('with', 'IN'),
 ('a', 'DT'),
 ('charger', 'NN'),
 ('when', 'WRB'),
 ('u', 'JJ'),
 ('buy', 'VB'),
 ('it', 'PRP'),
 (':)', 'JJ')]

In [7]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [8]:
from nltk.corpus import wordnet as wn

word_synset = wn.synsets("car")
print("synsets:", word_synset)
print("lemma names:", word_synset[0].lemma_names())

synsets: [Synset('car.n.01'), Synset('car.n.02'), Synset('car.n.03'), Synset('car.n.04'), Synset('cable_car.n.01')]
lemma names: ['car', 'auto', 'automobile', 'machine', 'motorcar']


In [9]:
word_synset[0].definition()

'a motor vehicle with four wheels; usually propelled by an internal combustion engine'

In [10]:
word_synset[0].examples()

['he needs a car to get to work']

In [11]:
word_synset[1].definition()

'a wheeled vehicle adapted to the rails of railroad'

In [12]:
word_synset[1].examples()

['three cars had jumped the rails']

In [13]:
word_synset[0].hyponyms()

[Synset('ambulance.n.01'),
 Synset('beach_wagon.n.01'),
 Synset('bus.n.04'),
 Synset('cab.n.03'),
 Synset('compact.n.03'),
 Synset('convertible.n.01'),
 Synset('coupe.n.01'),
 Synset('cruiser.n.01'),
 Synset('electric.n.01'),
 Synset('gas_guzzler.n.01'),
 Synset('hardtop.n.01'),
 Synset('hatchback.n.01'),
 Synset('horseless_carriage.n.01'),
 Synset('hot_rod.n.01'),
 Synset('jeep.n.01'),
 Synset('limousine.n.01'),
 Synset('loaner.n.02'),
 Synset('minicar.n.01'),
 Synset('minivan.n.01'),
 Synset('model_t.n.01'),
 Synset('pace_car.n.01'),
 Synset('racer.n.02'),
 Synset('roadster.n.01'),
 Synset('sedan.n.01'),
 Synset('sport_utility.n.01'),
 Synset('sports_car.n.01'),
 Synset('stanley_steamer.n.01'),
 Synset('stock_car.n.01'),
 Synset('subcompact.n.01'),
 Synset('touring_car.n.01'),
 Synset('used-car.n.01')]

In [14]:
word_synset[0].hypernyms()

[Synset('motor_vehicle.n.01')]

In [16]:
tree = wn.synsets("tree")[0]
paths = tree.hypernym_paths()
for p in paths:
    print([synset.name() for synset in p])

['entity.n.01', 'physical_entity.n.01', 'object.n.01', 'whole.n.02', 'living_thing.n.01', 'organism.n.01', 'plant.n.02', 'vascular_plant.n.01', 'woody_plant.n.01', 'tree.n.01']


In [17]:
tree.part_meronyms()

[Synset('burl.n.02'),
 Synset('crown.n.07'),
 Synset('limb.n.02'),
 Synset('stump.n.01'),
 Synset('trunk.n.01')]

In [18]:
tree.substance_meronyms()

[Synset('heartwood.n.01'), Synset('sapwood.n.01')]

In [19]:
tree.member_holonyms()

[Synset('forest.n.01')]

In [20]:
from nltk.stem.wordnet import WordNetLemmatizer
tokens = tweet_tokens[50]

In [21]:
# Create a lemmatizer
lemmatizer = WordNetLemmatizer()

In [22]:
def get_wordnet_pos(tag):
    if tag.startswith('NN'):
        return 'n'  
    elif tag.startswith('VB'):
        return 'v'  
    elif tag.startswith('JJ'):
        return 'a'  
    else:
        return 'a'  


def lemmatize_sentence(tokens):
    lemmatized_sentence = []
    
    pos_tags = pos_tag(tokens)
    
    for token, tag in pos_tags:
        wordnet_pos = get_wordnet_pos(tag)

        lemmatized_token = lemmatizer.lemmatize(token, wordnet_pos)
        lemmatized_sentence.append(lemmatized_token)
    
    return lemmatized_sentence

lemmatize_sentence(tokens)

['@groovinshawn',
 'they',
 'be',
 'rechargeable',
 'and',
 'it',
 'normally',
 'come',
 'with',
 'a',
 'charger',
 'when',
 'u',
 'buy',
 'it',
 ':)']

In [23]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [24]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
print(len(stop_words))
for i in range(10):
    print(stop_words[i])

198
a
about
above
after
again
against
ain
all
am
an


In [25]:
import re, string
from nltk.corpus import wordnet as wn

def process_tokens(tweet_tokens):
    cleaned_tokens = []
    stop_words = set(stopwords.words('english'))

    for token, tag in pos_tag(tweet_tokens):
        
        if re.match(r'http[s]?://\S+', token):
            continue
        
        if token.startswith('@'):
            continue

        if token.lower() in stop_words:
            continue
            
        if token in string.punctuation:
            continue
            
        if token.startswith('#'):
            continue
        
        token = token.lower()
        wordnet_pos = get_wordnet_pos(tag)
        synsets = wn.synsets(token, pos=wordnet_pos)
        
        if synsets:
            lemmatized_token = synsets[0].lemmas()[0].name()
        else:
            lemmatized_token = token
        
        cleaned_tokens.append(lemmatized_token)
    
    return cleaned_tokens

print("Before:", tweet_tokens[50])
print("After:", process_tokens(tweet_tokens[50]))


Before: ['@groovinshawn', 'they', 'are', 'rechargeable', 'and', 'it', 'normally', 'comes', 'with', 'a', 'charger', 'when', 'u', 'buy', 'it', ':)']
After: ['rechargeable', 'normally', 'come', 'charger', 'u', 'buy', ':)']


In [26]:
positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')

positive_cleaned_tokens_list = [process_tokens(tweet) for tweet in positive_tweet_tokens]
negative_cleaned_tokens_list = [process_tokens(tweet) for tweet in negative_tweet_tokens]

print(positive_tweet_tokens[500])
print(positive_cleaned_tokens_list[500])

['Dang', 'that', 'is', 'some', 'rad', '@AbzuGame', '#fanart', '!', ':D', 'https://t.co/bI8k8tb9ht']
['dang', 'rad', ':d']


In [27]:
from nltk import FreqDist

def get_all_words(cleaned_tokens_list):
    all_words = []
    for tweet in cleaned_tokens_list:
        all_words.extend(tweet)
    return all_words

all_pos_words = get_all_words(positive_cleaned_tokens_list)

print("First 10 words from all positive tokens:", all_pos_words[:10])

First 10 words from all positive tokens: ['top', 'prosecute', 'member', 'community', 'week', ':)', 'hey', 'James', 'odd', ':/']


In [28]:
fdist_pos = FreqDist(all_pos_words)

print(fdist_pos.most_common(10))

[(':)', 3691), (':-)', 701), (':d', 658), ('thanks', 383), ('follow', 362), ('love', 337), ('...', 290), ('get', 269), ('thank', 258), ('good', 238)]


In [29]:
from nltk.corpus import wordnet as wn

def semantic_distance(word1, word2):
    synsets1 = wn.synsets(word1)
    synsets2 = wn.synsets(word2)

    if not synsets1 or not synsets2:
        return float('inf')  

    min_distance = float('inf')

    for syn1 in synsets1:
        for syn2 in synsets2:
            common_hypernyms = syn1.common_hypernyms(syn2) 
            
            if common_hypernyms:
                lch = common_hypernyms[0] 
                distance = syn1.shortest_path_distance(lch) + syn2.shortest_path_distance(lch)
                min_distance = min(min_distance, distance)

    return min_distance if min_distance != float('inf') else None  


word1 = "sister"
word2 = "brother"
print(f"Semantic distance between '{word1}' and '{word2}':", semantic_distance(word1, word2))


Semantic distance between 'sister' and 'brother': 12


In [30]:
from datasets import load_dataset

dataset = load_dataset("gxb912/large-twitter-tweets-sentiment")
dataset

DatasetDict({
    train: Dataset({
        features: ['sentiment', 'text'],
        num_rows: 179995
    })
    test: Dataset({
        features: ['sentiment', 'text'],
        num_rows: 44999
    })
})

In [31]:
[tweet_tokens for tweet_tokens in positive_cleaned_tokens_list][0]

['top', 'prosecute', 'member', 'community', 'week', ':)']

In [32]:
def get_token_dict(tokens):
    return dict([token, True] for token in tokens)
    
def get_tweets_for_model(cleaned_tokens_list):   
    return [get_token_dict(tweet_tokens) for tweet_tokens in cleaned_tokens_list]

positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)

In [33]:
import random

positive_dataset = [(tweet_dict, "Positive")
                     for tweet_dict in positive_tokens_for_model]

negative_dataset = [(tweet_dict, "Negative")
                     for tweet_dict in negative_tokens_for_model]

dataset = positive_dataset + negative_dataset

random.shuffle(dataset)

train_data = dataset[:7000]
test_data = dataset[7000:]

In [34]:
from nltk import classify
from nltk import NaiveBayesClassifier
classifier = NaiveBayesClassifier.train(train_data)

print("Accuracy is:", classify.accuracy(classifier, test_data))

print(classifier.show_most_informative_features(10))

Accuracy is: 0.9956666666666667
Most Informative Features
                      :( = True           Negati : Positi =   2065.4 : 1.0
                      :) = True           Positi : Negati =   1659.6 : 1.0
                     sad = True           Negati : Positi =     26.0 : 1.0
                 welcome = True           Positi : Negati =     23.1 : 1.0
                     Bam = True           Positi : Negati =     20.4 : 1.0
                  arrive = True           Positi : Negati =     19.9 : 1.0
                followed = True           Negati : Positi =     13.7 : 1.0
                    miss = True           Negati : Positi =     13.6 : 1.0
                 web_log = True           Positi : Negati =     12.4 : 1.0
                   didnt = True           Negati : Positi =     12.3 : 1.0
None


In [35]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [36]:
from nltk.tokenize import word_tokenize

custom_tweet = "the service was not good"

custom_tokens = process_tokens(word_tokenize(custom_tweet))

print(classifier.classify(get_token_dict(custom_tokens)))

Positive


In [37]:
def get_sentiment(text):
    custom_tokens = process_tokens(word_tokenize(text))
    return classifier.classify(get_token_dict(custom_tokens))

texts = ["bad", "service is bad", "service is really bad", "service is so terrible", "great service", "they gave me their money"]
for t in texts:
    print(t, ": ", get_sentiment(t))

bad :  Negative
service is bad :  Negative
service is really bad :  Negative
service is so terrible :  Negative
great service :  Positive
they gave me their money :  Negative


In [46]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from nltk.classify import NaiveBayesClassifier
from nltk import classify

def process_tokens(tokens):
    return [token.lower() for token in tokens if token.isalpha()]

def preprocess_data(dataset):
    processed_data = []
    for tweet, label in zip(dataset['text'], dataset['sentiment']):
        tokens = process_tokens(word_tokenize(tweet))
        processed_data.append((' '.join(tokens), 'Positive' if label == 1 else 'Negative'))
    return processed_data

train_data = preprocess_data(dataset['train'])
test_data = preprocess_data(dataset['test'])

train_data_nb = [(dict([(token, True) for token in process_tokens(word_tokenize(tweet))]), label) for tweet, label in train_data]
test_data_nb = [(dict([(token, True) for token in process_tokens(word_tokenize(tweet))]), label) for tweet, label in test_data]

naive_bayes_classifier = NaiveBayesClassifier.train(train_data_nb)

naive_bayes_accuracy = classify.accuracy(naive_bayes_classifier, test_data_nb)
print("Naive Bayes Classifier Accuracy:", naive_bayes_accuracy)

vectorizer = CountVectorizer()

train_texts = [tweet for tweet, _ in train_data]
train_labels = [label for _, label in train_data]

test_texts = [tweet for tweet, _ in test_data]
test_labels = [label for _, label in test_data]


X_train = vectorizer.fit_transform(train_texts)
X_test = vectorizer.transform(test_texts)

logreg_classifier = LogisticRegression(max_iter=1000)
logreg_classifier.fit(X_train, train_labels)
logreg_predictions = logreg_classifier.predict(X_test)

logreg_accuracy = accuracy_score(test_labels, logreg_predictions)
print("Logistic Regression Accuracy:", logreg_accuracy)



Naive Bayes Classifier Accuracy: 0.7401942265383675
Logistic Regression Accuracy: 0.7892397608835752
Naive Bayes Accuracy: 0.7401942265383675


In [47]:
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report

naive_bayes_predictions = [naive_bayes_classifier.classify(tweet) for tweet, _ in test_data_nb]
print("Naive Bayes Classification Report:")
print(classification_report(test_labels, naive_bayes_predictions))


logreg_predictions = logreg_classifier.predict(X_test)

print("Logistic Regression Classification Report:")
print(classification_report(test_labels, logreg_predictions))

Naive Bayes Classification Report:
              precision    recall  f1-score   support

    Negative       0.64      0.88      0.74     18967
    Positive       0.88      0.64      0.74     26032

    accuracy                           0.74     44999
   macro avg       0.76      0.76      0.74     44999
weighted avg       0.78      0.74      0.74     44999

Logistic Regression Classification Report:
              precision    recall  f1-score   support

    Negative       0.77      0.71      0.74     18967
    Positive       0.80      0.85      0.82     26032

    accuracy                           0.79     44999
   macro avg       0.79      0.78      0.78     44999
weighted avg       0.79      0.79      0.79     44999

