# DEPI Twitter Sentiment Analysis Project

## Data Analysis (Token frequencies for each sentiment)

In [2]:
import pandas as pd
import matplotlib.pyplot as plt

In [29]:
# Load the cleaned dataset from 'cleaned_data.csv' into a DataFrame.
df = pd.read_csv('cleaned_data.csv')

In [4]:
# Preview the first rows to see which columns the file provides.
df.head()

Unnamed: 0,Target,ID,Date,Query,User,Text,words,grams,mentions,hashtags,cleaned_text,cleaned_words,cleaned_grams
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...","['switchfoot', 'http', '', 'twitpiccom2y1zl', ...","['switchfoot http', 'http ', ' twitpiccom2y1zl...",['switchfoot'],[],a that s a bummer you shoulda got david carr o...,"['bummer', 'shoulda', 'got', 'david', 'carr', ...","['bummer shoulda', 'shoulda got', 'got david',..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,"['is', 'upset', 'that', 'he', 'ca', 'nt', 'upd...","['is upset', 'upset that', 'that he', 'he ca',...",[],[],is upset that he can t update his facebook by ...,"['upset', 'update', 'facebook', 'texting', 'mi...","['upset update', 'update facebook', 'facebook ..."
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,"['Kenichan', 'I', 'dived', 'many', 'times', 'f...","['Kenichan I', 'I dived', 'dived many', 'many ...",['Kenichan'],[],i dived many times for the ball managed to sav...,"['dived', 'many', 'times', 'ball', 'managed', ...","['dived many', 'many times', 'times ball', 'ba..."
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,"['my', 'whole', 'body', 'feels', 'itchy', 'and...","['my whole', 'whole body', 'body feels', 'feel...",[],[],my whole body feels itchy and like its on fire,"['whole', 'body', 'feels', 'itchy', 'like', 'f...","['whole body', 'body feels', 'feels itchy', 'i..."
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....","['nationwideclass', 'no', '', 'it', 's', 'not'...","['nationwideclass no', 'no ', ' it', 'it s', '...",['nationwideclass'],[],no it s not behaving at all i m mad why am i h...,"['behaving', 'mad', 'see']","['behaving mad', 'mad see']"


In [31]:
import pandas as pd
import ast

# Read the CSV file into a DataFrame.
# low_memory=False asks pandas not to process the file in internal chunks,
# which avoids mixed-type inference within a column (see the read_csv docs).
# NOTE(review): an earlier cell already reads 'cleaned_data.csv' into `df`;
# this statement replaces that frame — confirm the second load is intended.
df = pd.read_csv('./cleaned_data.csv', low_memory=False)

# Function to convert string representations of lists back to lists
def convert_to_list(s):
    """Parse the string form of a list (e.g. "['a', 'b']") back into a list.

    Parameters
    ----------
    s : object
        Normally a string holding a Python list literal, as read from the CSV.
        A value that is already a list is returned unchanged, so applying the
        conversion twice does not wipe the data.

    Returns
    -------
    list
        The parsed list (a parsed tuple is converted to a list). An empty list
        is returned when ``s`` is not a valid literal (this includes NaN from
        missing cells) or when it parses to something that is not a list/tuple.
    """
    if isinstance(s, list):
        return s
    try:
        parsed = ast.literal_eval(s)
    except (ValueError, SyntaxError, TypeError):
        # literal_eval raises TypeError for e.g. unhashable dict keys, in
        # addition to ValueError/SyntaxError for malformed or non-string input.
        return []
    if isinstance(parsed, list):
        return parsed
    if isinstance(parsed, tuple):
        return list(parsed)
    # A valid literal that is not a sequence (number, string, dict, ...) is not
    # a token list; treat it like an unparseable cell.
    return []

# The CSV stores the token lists as text; parse both cleaned columns back
# into real Python lists, one cell at a time.
df['cleaned_words'] = df['cleaned_words'].map(convert_to_list)
df['cleaned_grams'] = df['cleaned_grams'].map(convert_to_list)

# Now proceed with processing
#
# Build one long table of tokens (words and bi-grams) labelled with the tweet's
# Target, count each token per sentiment, keep the frequent ones and save them.

# Frequency cut-off: a token is kept only if it occurs MORE than this many
# times across both sentiments.
MIN_OVERALL_FREQUENCY = 10

# Process 'cleaned_words' (Words): one row per word, carrying the tweet's Target
words_df = df[['Target', 'cleaned_words']].explode('cleaned_words')
words_df = words_df.rename(columns={'cleaned_words': 'Token'})
words_df['Type'] = 'Word'

print(words_df.head())

# Process 'cleaned_grams' (Bi-Grams): one row per bi-gram
bigrams_df = df[['Target', 'cleaned_grams']].explode('cleaned_grams')
bigrams_df = bigrams_df.rename(columns={'cleaned_grams': 'Token'})
bigrams_df['Type'] = 'Bi-Gram'

# Combine 'words_df' and 'bigrams_df' into a single DataFrame
tokens_df = pd.concat([words_df, bigrams_df], ignore_index=True)

# explode() emits a NaN row for every empty list. Drop those rows BEFORE the
# cast to str: afterwards NaN would be the text 'nan' and could no longer be
# told apart from the genuine word "nan".
tokens_df = tokens_df.dropna(subset=['Token']).reset_index(drop=True)

# Ensure 'Token' column has string type for consistency
tokens_df['Token'] = tokens_df['Token'].astype(str)

# Step 2: Group by 'Token', 'Type', and 'Target' to calculate frequencies

# Calculate frequency counts (one row per Token/Type/Target combination)
freq = (
    tokens_df.groupby(['Token', 'Type', 'Target'])
    .size()
    .reset_index(name='Frequency')
)

# Step 3: Pivot the table to get frequencies for each sentiment

# Pivot the DataFrame to have separate columns for negative and positive
# frequencies. The values are counts, so aggregate with 'sum' explicitly
# instead of relying on pivot_table's default 'mean'.
freq_pivot = freq.pivot_table(
    index=['Token', 'Type'],
    columns='Target',
    values='Frequency',
    aggfunc='sum',
    fill_value=0
).reset_index()

# The pivot leaves 'Target' as the label of the columns axis; clear it so it
# does not show up in printed output looking like an extra column.
freq_pivot.columns.name = None

# Rename the columns for clarity (Target 0 -> negative, Target 4 -> positive)
freq_pivot = freq_pivot.rename(columns={
    0: 'Freq Neg',
    4: 'Freq Pos'
})

# Ensure 'Freq Neg' and 'Freq Pos' columns exist even if there are no occurrences
if 'Freq Neg' not in freq_pivot.columns:
    freq_pivot['Freq Neg'] = 0
if 'Freq Pos' not in freq_pivot.columns:
    freq_pivot['Freq Pos'] = 0

# Step 4: Calculate the Overall Frequency

freq_pivot['Overall Frequency'] = freq_pivot['Freq Neg'] + freq_pivot['Freq Pos']

# Reorder the columns
final_df = freq_pivot[['Token', 'Type', 'Overall Frequency', 'Freq Neg', 'Freq Pos']]

# Keep only tokens that occur more than MIN_OVERALL_FREQUENCY times overall
final_df = final_df[final_df['Overall Frequency'] > MIN_OVERALL_FREQUENCY]

# Step 5: Display or Save the Result

print(final_df.head())

# Optionally, save the final DataFrame to a CSV file
output_path = "./token_frequencies.csv"
final_df.to_csv(output_path, index=False)
print("✅ Token frequencies saved to:", output_path)

   Target    Token  Type
0       0   bummer  Word
0       0  shoulda  Word
0       0      got  Word
0       0    david  Word
0       0     carr  Word
Target   Token  Type  Overall Frequency  Freq Neg  Freq Pos
0           aa  Word                240       156        84
177        aaa  Word                157        97        60
289       aaaa  Word                 81        47        34
345      aaaaa  Word                 39        27        12
381     aaaaaa  Word                 30        17        13
✅ Token frequencies saved to: ./token_frequencies.csv


In [42]:
# Most frequent bi-grams across both sentiments (first five rows)
(
    final_df
    .loc[lambda frame: frame['Type'] == 'Bi-Gram']
    .sort_values('Overall Frequency', ascending=False)
    .head()
)

Target,Token,Type,Overall Frequency,Freq Neg,Freq Pos
1980707,last night,Bi-Gram,12524,7556,4968
1461242,good morning,Bi-Gram,10168,1943,8225
4085130,wish could,Bi-Gram,6816,5782,1034
1192504,feel like,Bi-Gram,6559,5129,1430
2117269,looking forward,Bi-Gram,5624,2195,3429


In [39]:
# Most frequent single words across both sentiments (first five rows)
(
    final_df
    .loc[lambda frame: frame['Type'] == 'Word']
    .sort_values('Overall Frequency', ascending=False)
    .head()
)

Target,Token,Type,Overall Frequency,Freq Neg,Freq Pos
1457262,good,Word,91324,29207,62117
840831,day,Word,89556,41373,48183
1378972,get,Word,82172,45610,36562
2041425,like,Word,78573,41053,37520
1423636,go,Word,74006,45634,28372
