# Preprocessing all files

In [1]:
import os
import pandas as pd
import emoji
import numpy as np
import re
import string
import enchant
import nltk 
from nltk.corpus import stopwords
from textblob import TextBlob

In [2]:
## pd.set_option('display.max_colwidth',2000)
# Show up to 2000 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 2000)

# Ask pandas to treat +/-inf as missing values. This option was deprecated in
# pandas 2.1 and removed in pandas 3.0, where set_option raises OptionError
# (a KeyError subclass). Tolerate that so the script still starts on current
# pandas; on those versions infinities are no longer treated as NA
# automatically and must be converted explicitly wherever that matters.
try:
    pd.set_option('mode.use_inf_as_na', True)
except KeyError:
    pass


#Function for cleaning text of tweets
def clean_text(text):
    """Clean one raw tweet and return its normalized tokens.

    The tweet is stripped of URLs and emoji, punctuation characters listed in
    the module-level ``punct`` set are dropped, the rest is lower-cased and
    split on whitespace. Only tokens accepted by the module-level dictionary
    ``d`` are kept; tokens ending in "ing" are stemmed with ``stem`` and all
    other tokens are lemmatized with ``lemma``.

    Args:
        text: Raw tweet text.

    Returns:
        A list of cleaned tokens. An empty list is returned when ``text`` is
        not a string (e.g. the NaN pandas produces for an empty CSV field).
    """
    # re.sub would raise TypeError on non-string values such as NaN.
    if not isinstance(text, str):
        return []

    # Removing URL
    text = re.sub(r"http\S+", "", text)

    # Removing emoji/stickers. emoji.get_emoji_regexp() no longer exists in
    # emoji >= 2.0, so prefer replace_emoji() and keep the regexp only as a
    # fallback for older releases that lack it.
    if hasattr(emoji, "replace_emoji"):
        text = emoji.replace_emoji(text, replace="")
    else:
        text = re.sub(emoji.get_emoji_regexp(), r"", text)

    # Removing punctuation and tokenizing
    tokens = "".join(ch.lower() for ch in text if ch not in punct).split()

    # Removing non english word
    english_tokens = [token for token in tokens if d.check(token)]

    # Lemmatizing and stemming
    return [
        stem.stem(token) if token.endswith("ing") else lemma.lemmatize(token)
        for token in english_tokens
    ]


#Function to get the polarity value of a tweet
def polarity_value(tweet):
    """Return the TextBlob sentiment polarity of a tweet.

    Args:
        tweet: Either the tweet text, or a list/tuple of tokens (as produced
            by ``clean_text``). Any other value is converted with ``str``.

    Returns:
        The ``sentiment.polarity`` value reported by TextBlob.
    """
    # A token list must be joined back into a sentence: str(list) would feed
    # the list's repr (brackets, quotes and commas) to the sentiment analyzer.
    if isinstance(tweet, (list, tuple)):
        sentence = " ".join(str(token) for token in tweet)
    else:
        sentence = str(tweet)

    return TextBlob(sentence).sentiment.polarity
   
#Function for returning tweet type
def classify_tweet(polarity):
    """Translate a polarity score into a tweet label.

    Scores above 0.1 are labelled 'Positive tweet', scores below 0 are
    labelled 'Negative tweet', and everything else (including the range
    0 to 0.1) is labelled 'Neutral tweet'.
    """
    label = 'Neutral tweet'
    if polarity > 0.1:
        label = 'Positive tweet'
    elif polarity < 0:
        label = 'Negative tweet'
    return label
    

# Function for counting the words in a tweet
def word_count(x):
    """Return how many whitespace-separated words the string ``x`` contains."""
    return len(x.split())


#Function for counting the number of hashtags
def count_hashtag(tweet):
    """Return the number of whitespace-separated words in ``tweet`` that
    contain a '#' character anywhere in the word."""
    marker = '#'
    return sum(1 for token in tweet.split() if marker in token)
    
    
#Setting punctuation
# Characters stripped from tweets: ASCII punctuation plus a handful of
# typographic and invisible Unicode characters.
punct = set(string.punctuation).union("“—\u200b‘•’\u2060┉⊰\u2063”")

# Dictionary used to keep only words recognized under the "en_US" tag
d = enchant.Dict("en_US")

# Stemmer and lemmatizer shared by the text-cleaning step
stem = nltk.PorterStemmer()
lemma = nltk.WordNetLemmatizer()
    

# Walk the raw-topic folder, preprocess every CSV file found and save the
# result as "cleaned_<name>" in the cleaned-topic folder.
directory = "raw_trending_topics\\"
output_directory = "cleaned_trending_topic"

# Create the output folder up front so to_csv does not fail when it is missing.
os.makedirs(output_directory, exist_ok=True)

# Rows are kept only when every one of these columns is non-zero.
nonzero_columns = ['favorites', 'status_count', 'listed_count', 'favorites_count']

for root, dirs, files in os.walk(directory):
    for file in files:

        if not file.endswith(".csv"):
            continue

        print()
        print("-----------------------------------------")
        print()
        print("File no:", file)
        print()

        # Build the path from the folder os.walk is currently visiting, so
        # files inside sub-folders are read from where they actually live.
        filename = os.path.join(root, file)

        #Reading the csv file
        data = pd.read_csv(filename)

        #checking duplicate tweets (kept for inspection; not used below)
        duplicate_tweet = data.tweet_id.duplicated().sum()
        #removing duplicate tweets
        data = data.drop_duplicates(subset="tweet_id")

        #Removing rows with value 0. The explicit copy makes the column
        #assignments below operate on an independent frame instead of a
        #slice of the original (avoids SettingWithCopyWarning).
        data = data[(data[nonzero_columns] != 0).all(axis=1)].copy()

        #Adding cleaned tweets in the dataframe
        data['cleaned_tweet'] = data['tweet'].apply(clean_text)

        #Getting polarity of each tweet
        data['polarity'] = data['cleaned_tweet'].apply(polarity_value)

        #Adding classified tweets in the dataframe
        data['tweet_type'] = data['polarity'].apply(classify_tweet)

        #Creating Avg_favorites_per_post feature
        data["Avg_favorites_per_post"] = data['favorites_count'] / data['status_count']

        #Creating listed-follower ratio feature
        data["listed_follower_ratio"] = data['listed_count'] / data['followers']

        #creating follower-following ratio feature
        data['follower-following_ratio'] = data['followers'] / data['friends']

        #replacing nan to 0. Division by zero above yields +/-inf; replace
        #those explicitly as well rather than relying on the global
        #'mode.use_inf_as_na' option, which newer pandas no longer provides.
        data.replace([np.inf, -np.inf, np.nan], 0, inplace=True)

        #Removing unnecessary Columns
        data = data.drop(['user_name', 'screen_name', 'tweet_id', 'location', 'time'], axis=1)

        #saving cleaned data to csv
        output_path = os.path.join(output_directory, "cleaned_" + file)
        data.to_csv(output_path, index=False)
      


-----------------------------------------

File no: topic_1.csv


-----------------------------------------

File no: topic_10.csv


-----------------------------------------

File no: topic_100.csv


-----------------------------------------

File no: topic_11.csv


-----------------------------------------

File no: topic_12.csv


-----------------------------------------

File no: topic_13.csv


-----------------------------------------

File no: topic_14.csv


-----------------------------------------

File no: topic_15.csv


-----------------------------------------

File no: topic_16.csv


-----------------------------------------

File no: topic_17.csv


-----------------------------------------

File no: topic_18.csv


-----------------------------------------

File no: topic_19.csv


-----------------------------------------

File no: topic_2.csv


-----------------------------------------

File no: topic_20.csv


-----------------------------------------

File no: topic_21.c