# Sentiment Analysis

## Imports

In [39]:
# pandas library and other Python modules
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from os import chdir
from os.path import isfile, join
from random import shuffle

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV

import numpy as np # linear algebra
from joblib import dump, load # used for saving and loading sklearn objects
from scipy.sparse import save_npz, load_npz # used for saving and loading sparse matrices
from scipy.stats import uniform
from scipy.sparse import csr_matrix

## Data Preparation

Here, what we have are tweets. I will be considering each tweet as a document.
Let's follow the following steps:
- Read the cached CSV
- Take the polarity, sensetivity, and text columns
- split into train and test set

In [40]:
#data loader class
class DataLoader:
    def __init__(self,dir_name,file_name):
        self.dir_name = dir_name
        self.file_name = file_name


    def read_csv(self):
        chdir(self.dir_name)
        tweets_df = pd.read_csv(self.file_name)
        chdir("notebooks")
        return tweets_df

In [41]:
filename = 'processed_tweet_data.csv'
loader = DataLoader('../', filename)
tweets_df = loader.read_csv()
print(tweets_df.shape)
print(tweets_df.columns)

(24637, 15)
Index(['created_at', 'source', 'original_text', 'polarity', 'subjectivity',
       'lang', 'favorite_count', 'retweet_count', 'original_author',
       'followers_count', 'friends_count', 'possibly_sensitive', 'hashtags',
       'user_mentions', 'place'],
      dtype='object')


In [124]:
df = tweets_df[['original_text', 'polarity', 'subjectivity', 'possibly_sensitive']]
print(df.shape)
df.head()

(24637, 4)


Unnamed: 0,original_text,polarity,subjectivity,possibly_sensitive
0,RT @nikitheblogger: Irre: Annalena Baerbock sa...,0.0,0.0,
1,RT @sagt_mit: Merkel schaffte es in 1 Jahr 1 M...,0.0,0.0,
2,RT @Kryptonoun: @WRi007 Pharma in Lebensmittel...,0.0,0.0,
3,RT @WRi007: Die #Deutschen sind ein braves Vol...,0.0,0.0,
4,RT @RolandTichy: Baerbock verkündet mal so neb...,0.0,0.0,


In [125]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24637 entries, 0 to 24636
Data columns (total 4 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   original_text       24633 non-null  object 
 1   polarity            24625 non-null  object 
 2   subjectivity        24625 non-null  float64
 3   possibly_sensitive  9982 non-null   object 
dtypes: float64(1), object(3)
memory usage: 770.0+ KB


In [126]:
print("Null entry count per column")
df.isnull().sum()

Null entry count per column


original_text             4
polarity                 12
subjectivity             12
possibly_sensitive    14655
dtype: int64

For now I will only drop rows where text is null

In [127]:
null_rows = df['polarity'].isna().tolist()
df = df.drop(df[null_rows].index)
df = df[df['polarity']!="en"]
df = df[df['polarity']!="ko"]

In [128]:
print("Null entry count per column")
df.isnull().sum()

Null entry count per column


original_text             0
polarity                  0
subjectivity              0
possibly_sensitive    14643
dtype: int64

In [129]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 24617 entries, 0 to 24636
Data columns (total 4 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   original_text       24617 non-null  object 
 1   polarity            24617 non-null  object 
 2   subjectivity        24617 non-null  float64
 3   possibly_sensitive  9974 non-null   object 
dtypes: float64(1), object(3)
memory usage: 961.6+ KB


In [130]:
%cd ..
from clean_tweets_dataframe import Clean_Tweets
%cd notebooks

/home/hat/dev-env/10Acadamy/week_0/Twitter-Data-Analysis
/home/hat/dev-env/10Acadamy/week_0/Twitter-Data-Analysis/notebooks


In [131]:
cleaner = Clean_Tweets(df)
# df = cleaner.convert_to_numbers()
column_names = ['polarity', 'subjectivity']
for column in column_names:
    print(column)
    df[column] = pd.to_numeric(df[column])
df.info()

Automation in Action...!!!
polarity
subjectivity
<class 'pandas.core.frame.DataFrame'>
Int64Index: 24617 entries, 0 to 24636
Data columns (total 4 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   original_text       24617 non-null  object 
 1   polarity            24617 non-null  float64
 2   subjectivity        24617 non-null  float64
 3   possibly_sensitive  9974 non-null   object 
dtypes: float64(2), object(2)
memory usage: 961.6+ KB


There are lots of missing values in the sensitivity colum, so i will drop it

In [132]:
df = df.drop(columns='possibly_sensitive')
df.reset_index(drop=True, inplace=True)
df.head()

Unnamed: 0,original_text,polarity,subjectivity
0,RT @nikitheblogger: Irre: Annalena Baerbock sa...,0.0,0.0
1,RT @sagt_mit: Merkel schaffte es in 1 Jahr 1 M...,0.0,0.0
2,RT @Kryptonoun: @WRi007 Pharma in Lebensmittel...,0.0,0.0
3,RT @WRi007: Die #Deutschen sind ein braves Vol...,0.0,0.0
4,RT @RolandTichy: Baerbock verkündet mal so neb...,0.0,0.0
