In [60]:
import pandas as pd
import numpy as np
import re
from datetime import datetime

In [61]:
# Load the raw tweet export.
# id_str is read as text: tweet IDs are 18-digit integers, which float64 cannot
# represent exactly, so letting pandas infer a float dtype (as the preview
# "9.47803e+17" shows) is lossy whenever the file still holds the full digits.
tweets_df = pd.read_csv("../TrumpTweets.csv", dtype={"id_str": str})

tweets_df.head()

Unnamed: 0,source,text,created_at,retweet_count,favorite_count,is_retweet,id_str
0,Twitter for iPhone,The United States has foolishly given Pakistan...,1/1/2018 12:12,101056.0,304676.0,,9.47803e+17
1,Twitter for iPhone,Iran is failing at every level despite the ter...,1/1/2018 12:44,29046.0,111467.0,,9.47811e+17
2,Twitter for iPhone,Will be leaving Florida for Washington (D.C.) ...,1/1/2018 13:37,16884.0,114754.0,,9.47824e+17
3,Twitter for iPhone,The people of Iran are finally acting against ...,1/2/2018 12:09,28227.0,105965.0,,9.48164e+17
4,Twitter for iPhone,Crooked Hillary Clinton’s top aid Huma Abedin ...,1/2/2018 12:48,37561.0,130933.0,,9.48174e+17


In [62]:
# https://towardsdatascience.com/selenium-tweepy-to-scrap-tweets-from-tweeter-and-analysing-sentiments-1804db3478ac
def remove_pattern(text, pattern_regex):
    """Return ``text`` with every match of ``pattern_regex`` removed.

    Parameters
    ----------
    text : str
        The string to clean.
    pattern_regex : str
        Regular expression describing the fragments to delete.

    Returns
    -------
    str
        ``text`` with all non-overlapping matches replaced by the empty string.
    """
    # Substitute with the pattern itself. Feeding each matched substring back
    # into re.sub as a new pattern misinterprets matches that contain regex
    # metacharacters such as "(", ".", or "+", and ignores anchors/lookarounds.
    return re.sub(pattern_regex, '', text)
# Strip "@handle: " attributions and standalone "RT" retweet markers from the
# raw text; the cleaned tweets are kept in a new column called 'tidy_tweets'.
# The pattern is a raw string so \w and \b reach the regex engine unchanged.
# The earlier glob-style " *RT*" meant "optional spaces, R, optional Ts" and so
# deleted every capital R (e.g. "Russia" -> "ussia"); \bRT\b matches only the
# standalone retweet token.
tweets_df['tidy_tweets'] = np.vectorize(remove_pattern)(tweets_df['text'], r"@\w*: |\bRT\b *")
tweets_df.head()

Unnamed: 0,source,text,created_at,retweet_count,favorite_count,is_retweet,id_str,tidy_tweets
0,Twitter for iPhone,The United States has foolishly given Pakistan...,1/1/2018 12:12,101056.0,304676.0,,9.47803e+17,The United States has foolishly given Pakistan...
1,Twitter for iPhone,Iran is failing at every level despite the ter...,1/1/2018 12:44,29046.0,111467.0,,9.47811e+17,Iran is failing at every level despite the ter...
2,Twitter for iPhone,Will be leaving Florida for Washington (D.C.) ...,1/1/2018 13:37,16884.0,114754.0,,9.47824e+17,Will be leaving Florida for Washington (D.C.) ...
3,Twitter for iPhone,The people of Iran are finally acting against ...,1/2/2018 12:09,28227.0,105965.0,,9.48164e+17,The people of Iran are finally acting against ...
4,Twitter for iPhone,Crooked Hillary Clinton’s top aid Huma Abedin ...,1/2/2018 12:48,37561.0,130933.0,,9.48174e+17,Crooked Hillary Clinton’s top aid Huma Abedin ...


In [63]:
# Rebuild each tweet from its whitespace-separated tokens, dropping every
# token that contains 'http' (i.e. the links).
cleaned_tweets = [
    ' '.join(token for token in tweet.split() if 'http' not in token)
    for tweet in tweets_df['tidy_tweets']
]

tweets_df['tidy_tweets'] = cleaned_tweets
tweets_df.head()

Unnamed: 0,source,text,created_at,retweet_count,favorite_count,is_retweet,id_str,tidy_tweets
0,Twitter for iPhone,The United States has foolishly given Pakistan...,1/1/2018 12:12,101056.0,304676.0,,9.47803e+17,The United States has foolishly given Pakistan...
1,Twitter for iPhone,Iran is failing at every level despite the ter...,1/1/2018 12:44,29046.0,111467.0,,9.47811e+17,Iran is failing at every level despite the ter...
2,Twitter for iPhone,Will be leaving Florida for Washington (D.C.) ...,1/1/2018 13:37,16884.0,114754.0,,9.47824e+17,Will be leaving Florida for Washington (D.C.) ...
3,Twitter for iPhone,The people of Iran are finally acting against ...,1/2/2018 12:09,28227.0,105965.0,,9.48164e+17,The people of Iran are finally acting against ...
4,Twitter for iPhone,Crooked Hillary Clinton’s top aid Huma Abedin ...,1/2/2018 12:48,37561.0,130933.0,,9.48174e+17,Crooked Hillary Clinton’s top aid Huma Abedin ...


In [71]:
# Keep only ASCII letters, '#' and spaces in a new 'absolute_tidy_tweets' column.
# regex=True is passed explicitly: since pandas 2.0 Series.str.replace defaults
# to literal matching, which would leave the text unchanged.
tweets_df['absolute_tidy_tweets'] = tweets_df['tidy_tweets'].str.replace("[^a-zA-Z# ]", "", regex=True)
tweets_df.head()

Unnamed: 0,source,text,created_at,retweet_count,favorite_count,is_retweet,id_str,tidy_tweets,absolute_tidy_tweets
0,Twitter for iPhone,The United States has foolishly given Pakistan...,1/1/2018 12:12,101056.0,304676.0,,9.47803e+17,The United States has foolishly given Pakistan...,The United States has foolishly given Pakistan...
1,Twitter for iPhone,Iran is failing at every level despite the ter...,1/1/2018 12:44,29046.0,111467.0,,9.47811e+17,Iran is failing at every level despite the ter...,Iran is failing at every level despite the ter...
2,Twitter for iPhone,Will be leaving Florida for Washington (D.C.) ...,1/1/2018 13:37,16884.0,114754.0,,9.47824e+17,Will be leaving Florida for Washington (D.C.) ...,Will be leaving Florida for Washington DC toda...
3,Twitter for iPhone,The people of Iran are finally acting against ...,1/2/2018 12:09,28227.0,105965.0,,9.48164e+17,The people of Iran are finally acting against ...,The people of Iran are finally acting against ...
4,Twitter for iPhone,Crooked Hillary Clinton’s top aid Huma Abedin ...,1/2/2018 12:48,37561.0,130933.0,,9.48174e+17,Crooked Hillary Clinton’s top aid Huma Abedin ...,Crooked Hillary Clintons top aid Huma Abedin h...


In [72]:
# Parse the 'created_at' strings into timestamps, then keep only the calendar
# date of each tweet in a new 'tweetdate' column.
created_timestamps = pd.to_datetime(tweets_df['created_at'])
tweets_df['tweetdate'] = created_timestamps.dt.date
tweets_df.head()

Unnamed: 0,source,text,created_at,retweet_count,favorite_count,is_retweet,id_str,tidy_tweets,absolute_tidy_tweets,tweetdate
0,Twitter for iPhone,The United States has foolishly given Pakistan...,1/1/2018 12:12,101056.0,304676.0,,9.47803e+17,The United States has foolishly given Pakistan...,The United States has foolishly given Pakistan...,2018-01-01
1,Twitter for iPhone,Iran is failing at every level despite the ter...,1/1/2018 12:44,29046.0,111467.0,,9.47811e+17,Iran is failing at every level despite the ter...,Iran is failing at every level despite the ter...,2018-01-01
2,Twitter for iPhone,Will be leaving Florida for Washington (D.C.) ...,1/1/2018 13:37,16884.0,114754.0,,9.47824e+17,Will be leaving Florida for Washington (D.C.) ...,Will be leaving Florida for Washington DC toda...,2018-01-01
3,Twitter for iPhone,The people of Iran are finally acting against ...,1/2/2018 12:09,28227.0,105965.0,,9.48164e+17,The people of Iran are finally acting against ...,The people of Iran are finally acting against ...,2018-01-02
4,Twitter for iPhone,Crooked Hillary Clinton’s top aid Huma Abedin ...,1/2/2018 12:48,37561.0,130933.0,,9.48174e+17,Crooked Hillary Clinton’s top aid Huma Abedin ...,Crooked Hillary Clintons top aid Huma Abedin h...,2018-01-02


In [52]:
# Write the cleaned frame to disk without the DataFrame index column.
clean_csv_path = 'CleanTrumpTweets.csv'
tweets_df.to_csv(clean_csv_path, index=False)