## The model

In [7]:
import pandas as pd

In [8]:
# Parse the training CSV once, decoding it as ISO-8859-1.
df = pd.read_csv("train.csv", encoding='ISO-8859-1')
# Independent second frame with the same content, copied in memory instead of
# parsing the same file from disk a second time.
traindf = df.copy()

In [9]:
# Show the loaded training frame as the cell output (long frames render truncated).
df

Unnamed: 0,textID,text,selected_text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²)
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,morning,0-20,Afghanistan,38928346,652860.0,60
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,noon,21-30,Albania,2877797,27400.0,105
2,088c60f138,my boss is bullying me...,bullying me,negative,night,31-45,Algeria,43851044,2381740.0,18
3,9642c003ef,what interview! leave me alone,leave me alone,negative,morning,46-60,Andorra,77265,470.0,164
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,noon,60-70,Angola,32866272,1246700.0,26
...,...,...,...,...,...,...,...,...,...,...
27476,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative,night,31-45,Ghana,31072940,227540.0,137
27477,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",negative,morning,46-60,Greece,10423054,128900.0,81
27478,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive,noon,60-70,Grenada,112523,340.0,331
27479,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive,night,70-100,Guatemala,17915568,107160.0,167


In [10]:
# Keep only the 'text' column of df as a Series.
data = df['text']

In [11]:
# Show the extracted 'text' Series as the cell output.
data

0                      I`d have responded, if I were going
1            Sooo SAD I will miss you here in San Diego!!!
2                                my boss is bullying me...
3                           what interview! leave me alone
4         Sons of ****, why couldn`t they put them on t...
                               ...                        
27476     wish we could come see u on Denver  husband l...
27477     I`ve wondered about rake to.  The client has ...
27478     Yay good for both of you. Enjoy the break - y...
27479                           But it was worth it  ****.
27480       All this flirting going on - The ATG smiles...
Name: text, Length: 27481, dtype: object

In [12]:
def split_data_by_sentiment(data, sentiment):
    """
    Collect the texts of all rows that carry a given sentiment label.

    Parameters:
       data (DataFrame): Frame with at least a 'text' and a 'sentiment' column.
       sentiment (str): Label to select; compared for equality against 'sentiment'.

    Returns:
        list: The 'text' values of the matching rows, in their original order.
    """
    has_label = data['sentiment'] == sentiment
    matching_text = data['text'][has_label]
    return matching_text.tolist()

# Build one list of tweet texts per sentiment label from df
# (df must provide the 'text' and 'sentiment' columns)
positive_data, negative_data, neutral_data = (
    split_data_by_sentiment(df, label)
    for label in ('positive', 'negative', 'neutral')
)

In [13]:
def preprocess_tweet(tweet):
    """
    Normalize a tweet into a list of stemmed, stopword-free tokens.

    Parameters:
        tweet: The raw tweet text. Anything that is not a str (for example the
            NaN pandas uses for a missing 'text' cell, or None) is treated as
            an empty tweet.

    Returns:
        list: Lowercased, punctuation-free, Porter-stemmed tokens with English
        stopwords removed; an empty list for non-string input.
    """
    # Missing values have no .lower(); return early instead of raising
    # AttributeError halfway through a pass over the dataset.
    if not isinstance(tweet, str):
        return []

    # Convert the tweet to lowercase
    tweet = tweet.lower()

    # Remove punctuation from the tweet using translation
    tweet = tweet.translate(str.maketrans("", "", string.punctuation))

    # Tokenize the tweet into individual words
    tokens = nltk.word_tokenize(tweet)

    # Initialize a Porter stemmer for word stemming
    stemmer = PorterStemmer()

    # Get a set of English stopwords from NLTK
    stopwords_set = set(stopwords.words("english"))

    # Filter out stopwords first, then stem the tokens that remain
    tokens = [stemmer.stem(token) for token in tokens if token not in stopwords_set]

    # Return the preprocessed tokens
    return tokens

In [14]:
import pandas as pd
from collections import defaultdict

# Load the training data into train_df. This reads the same "train.csv" as the
# earlier df load, so the word counts below are computed from train_df, not df.
train_df = pd.read_csv("train.csv", encoding='ISO-8859-1')  # Update the path and encoding if needed

# Simplified preprocess_tweet. It reuses the name of the earlier NLTK-based
# version, so once this cell has run, calls resolve to this whitespace tokenizer.
def preprocess_tweet(tweet):
    """Lowercase a tweet and split it on whitespace.

    Returns an empty list when `tweet` is not a str (e.g. a missing value).
    """
    if not isinstance(tweet, str):
        return []
    return tweet.lower().split()

def calculate_word_counts(tweets):
    """Tally how often each preprocessed token occurs across `tweets`.

    Parameters:
        tweets: Iterable of tweet texts; each one is tokenized with preprocess_tweet.

    Returns:
        defaultdict: Maps token -> number of occurrences (absent keys read as 0).
    """
    tally = defaultdict(int)

    # Flatten every tweet's tokens into a single lazy stream, then count them
    token_stream = (word for text in tweets for word in preprocess_tweet(text))
    for word in token_stream:
        tally[word] += 1

    return tally

# Drop rows whose 'text' is missing before counting words
train_df = train_df.dropna(subset=['text'])

# Token frequencies per sentiment: select that sentiment's texts, then count
word_count_positive = calculate_word_counts(split_data_by_sentiment(train_df, 'positive'))
word_count_negative = calculate_word_counts(split_data_by_sentiment(train_df, 'negative'))
word_count_neutral = calculate_word_counts(split_data_by_sentiment(train_df, 'neutral'))

# For demonstration, print out the word counts
print("Positive Word Counts:", word_count_positive)
print("Negative Word Counts:", word_count_negative)
print("Neutral Word Counts:", word_count_neutral)




In [15]:
def calculate_likelihood(word_count, total_words, laplacian_smoothing=1, vocabulary_size=None):
    """
    Estimate P(word | class) for every word seen in a class, with additive smoothing.

    Parameters:
        word_count (dict): Maps word -> occurrence count within one class.
        total_words (int): Total number of word occurrences in that class.
        laplacian_smoothing (float): Pseudo-count added to every word (default 1).
        vocabulary_size (int, optional): Number of distinct words the smoothing
            mass is spread over. Pass the size of the vocabulary shared by all
            classes so every class is smoothed over the same set of words.
            Defaults to len(word_count), i.e. only the words seen in this class.

    Returns:
        dict: Maps word -> (count + smoothing) / (total_words + smoothing * vocabulary_size).

    Raises:
        ZeroDivisionError: If word_count is non-empty and the denominator is 0.
    """
    if vocabulary_size is None:
        # Original behaviour: smooth over this class's own vocabulary
        vocabulary_size = len(word_count)

    # The denominator is the same for every word, so compute it once
    denominator = total_words + laplacian_smoothing * vocabulary_size

    return {
        word: (count + laplacian_smoothing) / denominator
        for word, count in word_count.items()
    }

In [16]:
import math

def calculate_log_prior(sentiment, data):
    """
    Compute the log prior, log(P(sentiment)), from label frequencies.

    Parameters:
        sentiment (str): The sentiment label whose prior is wanted.
        data (DataFrame): Frame with a 'sentiment' column; every row counts once.

    Returns:
        float: Natural log of (rows labelled `sentiment`) / (all rows), or
        -inf when no row carries that label.

    Raises:
        ZeroDivisionError: If `data` has no rows.
    """
    matching_rows = int((data['sentiment'] == sentiment).sum())
    prior = matching_rows / len(data)

    # math.log(0) raises "math domain error"; a label that never occurs has
    # prior 0, whose log is -inf, so such a class can never win the argmax.
    if prior == 0:
        return float('-inf')

    return math.log(prior)

# Log priors of the positive, negative and neutral classes, measured on df
log_prior_positive, log_prior_negative, log_prior_neutral = (
    calculate_log_prior(label, df)
    for label in ('positive', 'negative', 'neutral')
)


In [17]:
import math
from collections import defaultdict

# Assuming word_count_positive, word_count_negative, and word_count_neutral are already calculated

# Total counts of words in each sentiment category
total_positive = sum(word_count_positive.values())
total_negative = sum(word_count_negative.values())
total_neutral = sum(word_count_neutral.values())

# Laplace-smoothed likelihoods via calculate_likelihood, the same helper the
# Training section uses (a plain count / total ratio applies no smoothing and
# would disagree with the values computed there).
likelihood_positive = calculate_likelihood(word_count_positive, total_positive)
likelihood_negative = calculate_likelihood(word_count_negative, total_negative)
likelihood_neutral = calculate_likelihood(word_count_neutral, total_neutral)

# Work in log space so per-token probabilities can be summed instead of multiplied
log_likelihood_positive = {word: math.log(prob) for word, prob in likelihood_positive.items()}
log_likelihood_negative = {word: math.log(prob) for word, prob in likelihood_negative.items()}
log_likelihood_neutral = {word: math.log(prob) for word, prob in likelihood_neutral.items()}

# Each dictionary holds one entry per distinct token, so show only a short
# preview for verification instead of dumping the full vocabulary.
PREVIEW_SIZE = 5
print("Log Likelihood Positive:", dict(list(log_likelihood_positive.items())[:PREVIEW_SIZE]))
print("Log Likelihood Negative:", dict(list(log_likelihood_negative.items())[:PREVIEW_SIZE]))
print("Log Likelihood Neutral:", dict(list(log_likelihood_neutral.items())[:PREVIEW_SIZE]))





In [18]:
def classify_tweet_with_scores(tweet, log_likelihood_positive, log_likelihood_negative, log_likelihood_neutral,
                               log_prior_positive, log_prior_negative, log_prior_neutral,
                               unseen_log_likelihood=None):
    """
    Score a tweet against the three sentiment classes and pick the best one.

    Each class score is its log prior plus the sum of the log-likelihoods of the
    tweet's tokens (tokens are produced by preprocess_tweet).

    Parameters:
        tweet: The tweet text to classify.
        log_likelihood_positive, log_likelihood_negative, log_likelihood_neutral (dict):
            Per-class maps of token -> log P(token | class).
        log_prior_positive, log_prior_negative, log_prior_neutral (float):
            Per-class log priors.
        unseen_log_likelihood (float, optional): Log-likelihood charged to a class
            for a token that class has not seen but another class has. When None,
            each class falls back to the smallest log-likelihood in its own table.
            That fallback scans the table, so pass a precomputed value when
            classifying many tweets.

    Returns:
        tuple: (predicted_sentiment, sentiment_scores) where sentiment_scores maps
        'positive' / 'negative' / 'neutral' to the class log score.
    """
    # Tokenize and preprocess the input tweet
    tokens = preprocess_tweet(tweet)

    # Insertion order (positive, negative, neutral) decides ties in max() below
    class_models = {
        'positive': (log_prior_positive, log_likelihood_positive),
        'negative': (log_prior_negative, log_likelihood_negative),
        'neutral': (log_prior_neutral, log_likelihood_neutral),
    }
    tables = [table for _, table in class_models.values()]

    # A token that no class has ever seen carries no evidence either way: skip it
    informative_tokens = [token for token in tokens if any(token in table for table in tables)]

    sentiment_scores = {}
    for label, (log_prior, table) in class_models.items():
        missing_penalty = unseen_log_likelihood
        if missing_penalty is None and any(token not in table for token in informative_tokens):
            # Defaulting to 0 would mean probability 1, i.e. the class would be
            # rewarded for never having seen the word. Charge the class's
            # least likely known word instead; a class with no words at all
            # cannot have produced the token.
            missing_penalty = min(table.values()) if table else float('-inf')

        sentiment_scores[label] = log_prior + sum(
            table.get(token, missing_penalty) for token in informative_tokens
        )

    # Determine the predicted sentiment based on the highest sentiment score
    predicted_sentiment = max(sentiment_scores, key=sentiment_scores.get)

    # Return the predicted sentiment and the sentiment scores
    return predicted_sentiment, sentiment_scores

## Training

In [19]:
# Read train.csv and test.csv, both decoded as ISO-8859-1
train_df = pd.read_csv("train.csv", encoding='ISO-8859-1')
new_data_df = pd.read_csv("test.csv", encoding='ISO-8859-1')

# Stack both frames under a fresh 0..n-1 index, then drop rows with no 'text'
combined_df = (
    pd.concat([train_df, new_data_df], ignore_index=True)
    .dropna(subset=['text'])
)

In [20]:
# Show the freshly loaded train_df as the cell output.
train_df

Unnamed: 0,textID,text,selected_text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²)
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,morning,0-20,Afghanistan,38928346,652860.0,60
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,noon,21-30,Albania,2877797,27400.0,105
2,088c60f138,my boss is bullying me...,bullying me,negative,night,31-45,Algeria,43851044,2381740.0,18
3,9642c003ef,what interview! leave me alone,leave me alone,negative,morning,46-60,Andorra,77265,470.0,164
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,noon,60-70,Angola,32866272,1246700.0,26
...,...,...,...,...,...,...,...,...,...,...
27476,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative,night,31-45,Ghana,31072940,227540.0,137
27477,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",negative,morning,46-60,Greece,10423054,128900.0,81
27478,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive,noon,60-70,Grenada,112523,340.0,331
27479,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive,night,70-100,Guatemala,17915568,107160.0,167


In [21]:
# Regroup the tweet texts by sentiment, this time from the reloaded train_df
positive_data, negative_data, neutral_data = (
    split_data_by_sentiment(train_df, label)
    for label in ('positive', 'negative', 'neutral')
)

In [22]:
# Token frequencies for each sentiment's list of tweets
word_count_positive, word_count_negative, word_count_neutral = map(
    calculate_word_counts,
    (positive_data, negative_data, neutral_data),
)

In [23]:
# Total number of token occurrences per class (sum of that class's counts)
total_positive, total_negative, total_neutral = (
    sum(counts.values())
    for counts in (word_count_positive, word_count_negative, word_count_neutral)
)

In [24]:
# Smoothed P(token | class) tables, using calculate_likelihood's default smoothing
likelihood_positive, likelihood_negative, likelihood_neutral = (
    calculate_likelihood(counts, total)
    for counts, total in (
        (word_count_positive, total_positive),
        (word_count_negative, total_negative),
        (word_count_neutral, total_neutral),
    )
)

In [25]:
# Move every likelihood into log space, one table per class
log_likelihood_positive, log_likelihood_negative, log_likelihood_neutral = (
    {word: math.log(prob) for word, prob in table.items()}
    for table in (likelihood_positive, likelihood_negative, likelihood_neutral)
)

In [26]:
# Log priors of the three classes, measured on train_df
log_prior_positive, log_prior_negative, log_prior_neutral = (
    calculate_log_prior(label, train_df)
    for label in ('positive', 'negative', 'neutral')
)

In [39]:
# Smoke-test the trained tables on a single one-word tweet
test_tweet = "Love"
predicted_sentiment, sentiment_scores = classify_tweet_with_scores(
    tweet=test_tweet,
    log_likelihood_positive=log_likelihood_positive,
    log_likelihood_negative=log_likelihood_negative,
    log_likelihood_neutral=log_likelihood_neutral,
    log_prior_positive=log_prior_positive,
    log_prior_negative=log_prior_negative,
    log_prior_neutral=log_prior_neutral,
)

# Report the winning label and the per-class log scores
print(f"Predicted Sentiment: {predicted_sentiment}")
print(f"Sentiment Scores: {sentiment_scores}")

Predicted Sentiment: positive
Sentiment Scores: {'positive': -6.2216628663707105, 'negative': -9.267396767265332, 'neutral': -7.674161738769002}
