# Data Understanding

In [16]:
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import seaborn as sns
import gensim
from gensim.models import CoherenceModel
from gensim import corpora
import pandas as pd
from pprint import pprint
import string
import os
import re

In [17]:
#data loader class
class DataLoader:
    """Read a CSV file located in a given directory.

    Input:
        dir_name: directory that contains the file
        file_name: name of the CSV file inside dir_name
    """

    def __init__(self, dir_name, file_name):
        self.dir_name = dir_name
        self.file_name = file_name

    def read_csv(self):
        """Read the configured CSV file.

        The path is built by joining dir_name and file_name. The working
        directory is deliberately left alone: changing it with os.chdir here
        would alter how every later relative path in the notebook resolves.

        Output:
            tweets_df: Pandas Dataframe with the contents of the file
        """
        csv_path = os.path.join(self.dir_name, self.file_name)
        tweets_df = pd.read_csv(csv_path)
        return tweets_df
  
    

In [18]:
# Loader pointed at the processed tweets CSV under ../data/
DataLoader_obj = DataLoader(
    dir_name='../data/',
    file_name='processed_tweet_data.csv',
)

# Columns of the data

In [19]:
# Load the processed tweets CSV into a DataFrame.
tweets_df=DataLoader_obj.read_csv()
# The dropna() result is only displayed, not assigned, so tweets_df keeps all of its rows.
# `lang` is missing for every row (see the missing-value check further down), so the
# displayed frame has no rows and only the column header shows up.
tweets_df.dropna()

Unnamed: 0,created_at,source,original_text,polarity,subjectivity,lang,favorite_count,retweet_count,original_author,followers_count,friends_count,possibly_sensitive,hashtags,user_mentions,place


# Number of rows

In [20]:
# Row count taken from the first entry of the frame's shape
tweets_df.shape[0]

22000

# The first 5 rows

In [21]:
# Preview the first five rows (n spelled out to match the heading)
tweets_df.head(n=5)

Unnamed: 0,created_at,source,original_text,polarity,subjectivity,lang,favorite_count,retweet_count,original_author,followers_count,friends_count,possibly_sensitive,hashtags,user_mentions,place
0,Sun Aug 07 22:31:20 +0000 2022,"<a href=""http://twitter.com/download/android"" ...",RT @i_ameztoy: Extra random image (I):\n\nLets...,-0.125,0.190625,,0,2,i_ameztoy,20497,2621,,['City'],['i_ameztoy'],
1,Sun Aug 07 22:31:16 +0000 2022,"<a href=""http://twitter.com/download/android"" ...",RT @IndoPac_Info: #China's media explains the ...,-0.1,0.1,,0,201,ZIisq,65,272,,"['China', 'Taiwan']",['IndoPac_Info'],
2,Sun Aug 07 22:31:07 +0000 2022,"<a href=""http://twitter.com/download/android"" ...","China even cut off communication, they don't a...",0.0,0.0,,0,0,Fin21Free,85,392,,['XiJinping'],['ZelenskyyUa'],Netherlands
3,Sun Aug 07 22:31:06 +0000 2022,"<a href=""http://twitter.com/download/android"" ...","Putin to #XiJinping : I told you my friend, Ta...",0.1,0.35,,0,0,Fin21Free,85,392,,['XiJinping'],[],Netherlands
4,Sun Aug 07 22:31:04 +0000 2022,"<a href=""http://twitter.com/download/iphone"" r...","RT @ChinaUncensored: I’m sorry, I thought Taiw...",-6.938894e-18,0.55625,,0,381,VizziniDolores,910,2608,,[],['ChinaUncensored'],"Ayent, Schweiz"


# Check for missing values

In [22]:
# Count the nulls per column once and reuse the result for both reports.
missing_per_column = tweets_df.isnull().sum()
print("The number of missing value(s) based on columns:\n{}".format(missing_per_column))
print("The sum of missing value(s) is:\n{}".format(missing_per_column.sum()))

The number of missing value(s) based on columns:
created_at                0
source                    0
original_text             0
polarity                  0
subjectivity              0
lang                  22000
favorite_count            0
retweet_count             0
original_author           0
followers_count           0
friends_count             0
possibly_sensitive    15809
hashtags                  0
user_mentions             0
place                  9893
dtype: int64
The sum of missing value(s) is:
47702


# Data Cleaning

In [23]:
import sys
sys.path.append('../')


In [24]:
from clean_tweets_dataframe import Clean_Tweets

In [25]:
# Replace every missing value with an empty string; tweets_df itself is left untouched.
# NOTE(review): processed_tweets is not used by the cleaning cells visible below,
# which operate on tweets_df - confirm whether that is intended.
processed_tweets = tweets_df.fillna(value="")

In [26]:
import re

def process_tweet(tweet):
    """Process tweet function.

    Strips hyperlinks, a leading old-style "RT" marker, @ handles and
    hashtags from the text. Nothing else is touched: whitespace and
    punctuation around the removed pieces (for example the ':' that follows
    a retweeted handle) stay in place.

    Input:
        tweet: a string containing a tweet
    Output:
        tweet: cleaned tweet

    """
    # remove hyperlinks first, so a '#fragment' or '@name' inside a URL is
    # dropped together with the URL instead of breaking it apart
    tweet = re.sub(r'https?://\S+', '', tweet)
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT\s+', '', tweet)
    # remove @ handles; \w also covers handles that start with a digit or '_'
    # and single-character handles
    tweet = re.sub(r'@\w[\w-]*', '', tweet)
    # remove hashtags; \w also covers non-ASCII hashtags and ones that start
    # with a digit
    tweet = re.sub(r'#\w[\w-]*', '', tweet)

    return tweet

In [32]:
def data_cleaner(df: pd.DataFrame, save=False) -> pd.DataFrame:
    """Data Cleaner Function.

    Runs the Clean_Tweets steps on the frame in order, then derives a
    `clean_text` column from `original_text`: each value is cast to str,
    passed through process_tweet, lower-cased and stripped of ASCII
    punctuation.

    Input:
        df: Pandas Dataframe with at least an `original_text` column
        save: Boolean value; when True the result is written to
            '../data/cleaned_tweet_data.csv'
    Output:
        df: Cleaned Dataframe

    """
    Tweet_cleaner = Clean_Tweets(df)
    df = Tweet_cleaner.remove_non_english_tweets(df)
    df = Tweet_cleaner.drop_duplicate(df)
    df = Tweet_cleaner.drop_unwanted_column(df)
    # NOTE(review): this step is invoked twice in a row; it is kept because
    # Clean_Tweets is not visible here - drop the repeat if it is idempotent.
    df = Tweet_cleaner.drop_unwanted_column(df)
    df = Tweet_cleaner.convert_to_datetime(df)
    df = Tweet_cleaner.convert_to_numbers(df)

    # Deletion table for ASCII punctuation, built once rather than per row.
    punctuation_table = str.maketrans('', '', string.punctuation)
    # Cast to str before process_tweet so its regex calls never receive a
    # non-string value.
    df['clean_text'] = (
        df['original_text']
        .astype(str)
        .apply(process_tweet)
        .str.lower()
        .str.translate(punctuation_table)
    )

    if save:
        # Saving is best-effort: a failure is reported but the cleaned frame
        # is still returned to the caller.
        try:
            df.to_csv('../data/cleaned_tweet_data.csv', index=False)

            print('File Successfully Saved.!!!')

        except Exception as e:
            print("Save failed...",e)
    return df

In [33]:
# Run the cleaning pipeline on the loaded frame and write the result to CSV (save=True).
# NOTE(review): tweets_df is what gets cleaned here, not the NaN-filled processed_tweets.
# NOTE(review): the info() output further down reports 0 entries. `lang` is missing for
# every row of this dataset, so presumably the non-English filter discards everything -
# confirm against Clean_Tweets.
cleaned_df = data_cleaner(tweets_df, save=True)

Automation in Action...!!!
File Successfully Saved.!!!


In [31]:
# Summarise the cleaned frame (entry count, dtypes, non-null counts).
# NOTE(review): this cell carries execution count 31 while data_cleaner was defined and
# run as 32/33, so the output shown may come from an earlier run - re-run the notebook
# top to bottom before relying on it.
cleaned_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 0 entries
Data columns (total 16 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   created_at          0 non-null      datetime64[ns]
 1   source              0 non-null      object        
 2   original_text       0 non-null      object        
 3   polarity            0 non-null      float64       
 4   subjectivity        0 non-null      float64       
 5   lang                0 non-null      float64       
 6   favorite_count      0 non-null      int64         
 7   retweet_count       0 non-null      int64         
 8   original_author     0 non-null      object        
 9   followers_count     0 non-null      int64         
 10  friends_count       0 non-null      int64         
 11  possibly_sensitive  0 non-null      object        
 12  hashtags            0 non-null      object        
 13  user_mentions       0 non-null      object        
 14  place 