In [None]:
import pandas as pd
import numpy as np
from datetime import timedelta, date
from textblob import TextBlob
from twitterscraper import query_tweets
import datetime as dt
import requests
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import re
from sklearn.model_selection import train_test_split
from nltk import word_tokenize
import nltk
# NOTE: this rebinds the name `stopwords` (imported above from nltk.corpus) to the
# result of .words('english'); the later cells use `stopwords` as that collection
# of English stopwords, not as the imported corpus object.
stopwords = stopwords.words('english')


In [None]:
# Column names of the dataframe built from the twitterscraper results.
cols = [
    'screen_name',
    'username',
    'user_id',
    'tweet_id',
    'tweet_url',
    'timestamp',
    'timestamp_epochs',
    'text',
    'text_html',
    'links',
    'hashtags',
    'has_media',
    'img_urls',
    'video_url',
    'likes',
    'retweets',
    'replies',
    'is_replied',
    'is_reply_to',
    'parent_tweet_id',
    'reply_to_users',
]
# Companies whose tweets are gathered and analysed in this notebook.
companies = [
    'amazon',
    'nike',
    'netflix',
    'tesla',
    'starbucks',
    'yelp',
]

In [None]:
#collecting the tweets matching a keyword within a range of days, collecting at least 1,000 of them, in English
def tweets_dataframe(company,begin_date, end_date, limit = 1000, lang='en'):
    """Query tweets for `company` between two dates and return them as a dataframe.

    Parameters
    ----------
    company : keyword handed to `query_tweets`.
    begin_date, end_date : passed to `query_tweets` as `begindate` / `enddate`.
    limit : passed through as the `limit` argument (1000 by default).
    lang : passed through as the `lang` argument ('en' by default).

    Returns
    -------
    pandas.DataFrame with one row per returned tweet, built from each tweet
    object's attribute dictionary.
    """
    scraped = query_tweets(
        company,
        begindate=begin_date,
        enddate=end_date,
        limit=limit,
        lang=lang,
    )
    # Each tweet object exposes its fields through __dict__; one dict becomes one row.
    records = (tweet.__dict__ for tweet in scraped)
    return pd.DataFrame(records)

In [None]:
#support function to iterate through a date range
def daterange(start_date, end_date):
    """Yield every day from `start_date` (inclusive) up to `end_date` (exclusive)."""
    total_days = int((end_date - start_date).days)
    offset = 0
    while offset < total_days:
        yield start_date + timedelta(days=offset)
        offset += 1

In [None]:
#collecting daily tweets about a company over a range of time, combining the tweets_dataframe function with daterange to gather them day by day.
#the csv is saved by appending the new tweets at the end of the file, so we can stop the kernel at any time without losing any data.
#The appending is done because the function takes hours.

def appending_tweets(company, begin_date, end_date, output_dir='../02 CSV_files/tweets'):
    """Scrape the tweets about `company` day by day and append them to its CSV file.

    Rows are appended after every scraped day, so the kernel can be interrupted
    without losing what was already collected.

    Parameters
    ----------
    company : keyword to search for; also used to name the CSV file.
    begin_date : first day to scrape (inclusive).
    end_date : day at which scraping stops (exclusive).
    output_dir : folder holding `{company}_tweets.csv`. It defaults to the folder
        the reading functions of this notebook load the per-company files from.

    Notes
    -----
    The header row and the data rows now go to the same file. The header is
    written only when the file does not exist yet, so calling this function
    repeatedly for the same company keeps accumulating rows instead of
    resetting a header-only file.
    """
    import os

    os.makedirs(output_dir, exist_ok=True)
    csv_path = os.path.join(output_dir, f'{company}_tweets.csv')

    # Create the file with its header once; later calls only append rows.
    if not os.path.exists(csv_path):
        pd.DataFrame(columns=cols).to_csv(csv_path)

    for single_date in daterange(begin_date, end_date):
        tweets = tweets_dataframe(company, single_date, single_date + timedelta(1))
        if tweets.empty:
            # Nothing scraped for this day, so there is nothing to append.
            continue
        # Align the rows with the header written above before appending.
        tweets.reindex(columns=cols).to_csv(csv_path, mode='a', header=False)

In [None]:
#Applying the appending_tweets function to a list of companies, with a begin date and an end date
def gathering_companies_tweets(companies_list, begin_year, begin_month, begin_day, end_year, end_month, end_day):
    """Gather the tweets of every company in `companies_list` for a date range.

    The range runs from the begin date through the end date, both included.

    Parameters
    ----------
    companies_list : iterable of company keywords.
    begin_year, begin_month, begin_day : first day to scrape.
    end_year, end_month, end_day : last day to scrape.
    """
    begin_date = dt.date(begin_year, begin_month, begin_day)
    # Build the real end date first and then add one day: `end_day + 1` inside
    # dt.date() raised ValueError for the last day of a month (e.g. day 32).
    end_date = dt.date(end_year, end_month, end_day) + timedelta(days=1)
    for company in companies_list:
        appending_tweets(company, begin_date, end_date)

In [None]:
# Gather the tweets of every company for the 2019-01-01 to 2020-01-01 range.
gathering_companies_tweets(
    companies,
    begin_year=2019, begin_month=1, begin_day=1,
    end_year=2020, end_month=1, end_day=1,
)

In [None]:
#preparing the data from the tweets for doing a Sentiment analysis on them
#only keeping letters and taking out webpages
def clean_up(s):
    """Lower-case `s` and replace every URL and every non-letter character with a space.

    Parameters
    ----------
    s : str, the raw tweet text.

    Returns
    -------
    str containing only lower-case ASCII letters and spaces.
    """
    # Raw string: "\S" in a normal string literal is an invalid escape sequence,
    # which Python reports with a warning at compile time.
    return re.sub(r"http\S+|[^a-zA-Z]", " ", s.lower())

#tokenizing the data
def tokenize(s):
    """Split the text `s` into tokens with nltk's word tokenizer."""
    tokens = nltk.word_tokenize(s)
    return tokens

#stemming and lemmatizing the data
def stem_and_lemmatize(l):
    """Stem every token of `l` with PorterStemmer, then lemmatize the stem.

    Returns a new list with one normalised token per input token.
    """
    porter = PorterStemmer()
    wordnet = WordNetLemmatizer()
    normalised = []
    for token in l:
        stemmed = porter.stem(token)
        normalised.append(wordnet.lemmatize(stemmed))
    return normalised

#removing the stopwords
def remove_stopwords(l):
    """Return the tokens of `l` that are not in the module-level `stopwords` collection."""
    kept = []
    for token in l:
        if token in stopwords:
            continue
        kept.append(token)
    return kept

#applying all the previous functions to a text
def bag_of_words(string):
    """Turn a raw tweet text into a list of normalised tokens.

    Steps: clean the text, tokenize it, drop the stopwords, then stem and
    lemmatize what is left.

    Parameters
    ----------
    string : str, the raw tweet text.

    Returns
    -------
    list of str, the processed tokens.
    """
    cleaned = clean_up(string)
    tokens = tokenize(cleaned)
    # Stopwords are filtered BEFORE stemming: the stopword list holds unstemmed
    # words, while the Porter stemmer rewrites e.g. "this" -> "thi" and
    # "was" -> "wa", so filtering after stemming let those stopwords through.
    tokens = remove_stopwords(tokens)
    return stem_and_lemmatize(tokens)

In [None]:
#function to get the subjectivity and polarity of a text. The score is to say if its positive or negative
def get_subjectivity(tweet):
    """Return the TextBlob subjectivity score of `tweet` (converted with str() first)."""
    blob = TextBlob(str(tweet))
    return blob.sentiment.subjectivity
def get_polarity(tweet):
    """Return the TextBlob polarity score of `tweet` (converted with str() first)."""
    blob = TextBlob(str(tweet))
    return blob.sentiment.polarity
def get_analysis(score):
    """Label a polarity score.

    Returns 'Negative' below zero, 'Neutral' at exactly zero and 'Positive'
    for everything else.
    """
    if score < 0:
        return 'Negative'
    if score == 0:
        return 'Neutral'
    return 'Positive'

In [None]:
# Applying the previously defined functions to analyse the tweets: cleaning the text and giving a score to each tweet.
# The tweet relevance score weights each tweet's sentiment according to the number of likes and retweets it has.
def get_tweets_analysis(df):
    """Score the sentiment of every tweet in `df` and return a slim analysis frame.

    Parameters
    ----------
    df : pandas.DataFrame with at least the columns 'text', 'timestamp',
        'likes' and 'retweets'. A 'company' column is carried over when present.

    Returns
    -------
    pandas.DataFrame with the columns 'timestamp', 'likes', 'retweets',
    'subjectivity', 'polarity', 'analysis' and 'tweet_relevance', followed by
    'company' if the input had one.

    Notes
    -----
    The input frame is left untouched: results are built on a copy of the few
    columns that are kept, instead of adding columns to the caller's frame.
    """
    processed_text = df['text'].apply(bag_of_words)
    print('bag of words: done')

    # One TextBlob evaluation per tweet provides both subjectivity and polarity.
    sentiments = [TextBlob(str(tokens)).sentiment for tokens in processed_text]

    kept_columns = ['timestamp', 'likes', 'retweets']
    # 'company' is optional so frames that were not tagged yet do not raise KeyError.
    has_company = 'company' in df.columns
    if has_company:
        kept_columns.append('company')
    result = df[kept_columns].copy()

    result['subjectivity'] = [sentiment.subjectivity for sentiment in sentiments]
    print('subjectivity: done')
    result['polarity'] = [sentiment.polarity for sentiment in sentiments]
    print('polarity: done')
    print('starting analysis')
    result['analysis'] = result['polarity'].apply(get_analysis)
    print('finished analysis...')
    # Weight the polarity by the engagement of the tweet (likes and retweets).
    result['tweet_relevance'] = result['polarity'] * (1 + result['likes']) * (1 + result['retweets'])
    print('last effort')

    ordered_columns = ['timestamp', 'likes', 'retweets', 'subjectivity', 'polarity',
                       'analysis', 'tweet_relevance']
    if has_company:
        ordered_columns.append('company')
    result = result[ordered_columns]
    print('enjoy!')
    return result

In [None]:
#reading the previously gathered tweets, cleaning & analysing them and saving all of them into one csv file.
def analysing_tweets(companies_list):
    """Analyse the gathered tweets of every company and append them to one CSV file.

    For each company the per-company tweets file is read, analysed with
    `get_tweets_analysis`, tagged with the company in a 'name' column and
    appended (without header) to 'cleaned_tweets.csv'.

    Parameters
    ----------
    companies_list : iterable of company names matching the per-company files.
    """
    for company in companies_list:
        df_company = pd.read_csv(f'../02 CSV_files/tweets/{company}_tweets.csv')
        # get_tweets_analysis keeps a 'company' column in its output, so the rows
        # are tagged before the analysis and the column is renamed afterwards.
        df_company['company'] = company
        # A separate name is used for the result: the loop variable `company`
        # used to be overwritten by the analysed dataframe.
        analysed = get_tweets_analysis(df_company).rename(columns={'company': 'name'})
        # 'isPartial' is not always present; dropping it must not raise.
        analysed = analysed.drop(columns='isPartial', errors='ignore')
        analysed.to_csv('../02 CSV_files/tweets/cleaned_tweets.csv', mode='a', header=False)

In [None]:
def companies_trends_peaks(companies_list):
    """Build the daily mean Google Trends peak value of every company.

    Reads 'hourly_peaks.csv', which is expected to hold a 'date' column and one
    column per company (gathered in the company trends notebook).

    Parameters
    ----------
    companies_list : iterable of company names; each must be a column of the file.

    Returns
    -------
    pandas.DataFrame with the columns 'date', 'name' and 'trends': one row per
    (day, company) holding the mean trend value of that day. An empty frame
    with those columns is returned when `companies_list` is empty.
    """
    hourly_trends = pd.read_csv('../02 CSV_files/trends/hourly_peaks.csv')

    # One frame per company: keep the date and the trend value, drop the rows
    # without a value, rename the company column to 'trends' and tag the rows.
    per_company = [
        hourly_trends[['date', company]]
        .dropna()
        .rename(columns={company: 'trends'})
        .assign(name=company)
        for company in companies_list
    ]
    if not per_company:
        return pd.DataFrame(columns=['date', 'name', 'trends'])

    # A single concat replaces the repeated DataFrame.append calls
    # (DataFrame.append was removed in pandas 2.0).
    trends_df = pd.concat(per_company)
    # Keep only the calendar day so the hourly values can be grouped per day.
    trends_df['date'] = pd.to_datetime(trends_df['date']).dt.date
    # Grouping by company and date because the peaks cover the week in which
    # the peak was reached and the previous one.
    return trends_df.groupby(['date', 'name']).agg({'trends': 'mean'}).reset_index()

In [None]:
# Daily Google Trends peaks of every company in `companies`.
companies_peaks = companies_trends_peaks(companies_list=companies)
#iterating through the companies' peaks dataframe in order to gather the tweets from each peak day for the sentiment analysis.
#the appending_tweets function already saves them in each company's csv file
def collecting_tweets_dataframe(df):
    """Gather one day of tweets for every row of `df`.

    Each row must provide a 'name' (company) and a 'date'; the tweets from that
    date up to the following day are collected through `appending_tweets`.
    """
    one_day = timedelta(days=1)
    for _, peak in df.iterrows():
        company_name = peak['name']
        peak_day = peak['date']
        appending_tweets(company_name, peak_day, peak_day + one_day)

# Gather the tweets of every peak day listed in `companies_peaks`.
collecting_tweets_dataframe(df=companies_peaks)

In [None]:
#reading all the saved tweets and unifying them into one dataframe to rule them all. Saving it as gathered_tweets.csv and returning it in case you want to use it
def creating_unified_df(companies_list):
    """Merge the per-company tweets files into one dataframe and save it.

    Parameters
    ----------
    companies_list : iterable of company names matching the per-company files.

    Returns
    -------
    pandas.DataFrame holding the rows of every company file, with an extra
    'company' column naming the source company. The same frame is written to
    'gathered_tweets.csv'. The original row labels of each file are kept.
    """
    per_company = [
        pd.read_csv(f'../02 CSV_files/tweets/{company}_tweets.csv', index_col=0)
        .assign(company=company)
        for company in companies_list
    ]
    # A single concat replaces the repeated DataFrame.append calls (removed in
    # pandas 2.0); pd.concat rejects an empty list, hence the fallback.
    tweets = pd.concat(per_company) if per_company else pd.DataFrame()
    tweets.to_csv('../02 CSV_files/tweets/gathered_tweets.csv')
    return tweets

In [None]:
# Merge the per-company files, then score the sentiment of every tweet.
tweets = creating_unified_df(companies_list=companies)
analysed_tweets = get_tweets_analysis(df=tweets)

In [None]:
#analysed_tweets.to_csv('02 CSV_files/tweets/analysed_tweets_3.csv')

In [None]:
#preparing the final tweets dataframe, after gathering and analysing all the tweets
def prepare_final_tweets_df(file_name):
    """Load an analysed tweets file and reduce it to a daily mean score per company.

    Parameters
    ----------
    file_name : name (without '.csv') of the file inside the tweets folder.

    Returns
    -------
    pandas.DataFrame with the columns 'date', 'name' and 'score', where 'score'
    is the mean of +1 (polarity above zero) and -1 (otherwise) over the
    non-neutral tweets of that day and company.
    """
    def _direction(polarity):
        # 1 for a positive tweet, -1 for everything else.
        return 1 if polarity > 0 else -1

    raw = pd.read_csv(f'../02 CSV_files/tweets/{file_name}.csv', index_col = 0)
    daily_scores = (
        raw
        # column names consistent with the other gathered datasets
        .rename(columns = {'timestamp':'date', 'company':'name'})
        # neutral tweets carry no direction, so they are left out
        .loc[lambda frame: frame['analysis'] != 'Neutral']
        .assign(
            date = lambda frame: pd.to_datetime(frame['date']).dt.date,
            score = lambda frame: frame['polarity'].apply(_direction),
        )
        [['date', 'name', 'score']]
        # one row per day and company holding the mean of its tweet scores
        .groupby(['date', 'name'])
        .agg({'score':'mean'})
        .reset_index()
    )
    return daily_scores


In [None]:
# Daily mean sentiment score per company, read from the analysed tweets file.
prepared_tweets = prepare_final_tweets_df(file_name='analysed_tweets')

In [None]:
#creating columns referencing previous rows, in order to see the evolution of the score
def shift_n_rows(df,col_name,n_rows):
    """Add `n_rows` lagged copies of `col_name` to `df` and return `df`.

    Column '{col_name}_day_k' holds the value found k rows earlier, for k from
    1 to `n_rows`. The columns are added to the frame that was passed in.
    """
    for lag in range(n_rows):
        rows_back = lag + 1
        lagged_column = f'{col_name}_day_{rows_back}'
        df[lagged_column] = df[col_name].shift(rows_back)
    return df

In [None]:
#applying the shift function individually to each company and combining the results afterwards into one dataframe
def create_shifted_df(companies_list, df ,col_name, n_of_shifts):
    """Add lagged copies of `col_name` per company and save the combined frame.

    Parameters
    ----------
    companies_list : iterable of company names to look up in the 'name' column.
    df : pandas.DataFrame with at least the columns 'name', 'date' and `col_name`.
        The rows of each company are presumably already in chronological
        order, since the lags are taken row by row -- TODO confirm with the caller.
    col_name : column to create the lagged copies of.
    n_of_shifts : number of lagged columns to add.

    Returns
    -------
    pandas.DataFrame with the rows of every company, the lagged columns added
    by `shift_n_rows`, rows containing missing values dropped and 'date'
    converted to datetime. The same frame is written to 'final_tweets.csv'.
    """
    # Iterate the `companies_list` argument (the global `companies` list used to
    # be read here) and shift a copy of each slice so `df` itself is not modified.
    per_company = [
        shift_n_rows(df[df['name'] == company].copy(), col_name, n_of_shifts)
        for company in companies_list
    ]
    # A single concat replaces the repeated DataFrame.append calls (removed in
    # pandas 2.0); pd.concat rejects an empty list, hence the empty-slice fallback.
    combined = pd.concat(per_company) if per_company else df.iloc[0:0].copy()
    shifted = (
        combined
        .dropna(axis = 0)
        .assign(date = lambda frame: pd.to_datetime(frame['date']))
    )
    shifted.to_csv('../02 CSV_files/csv_finals/final_tweets.csv')
    return shifted

In [None]:
# Daily scores of every company together with the scores of the 14 previous rows.
tweets_shifted = create_shifted_df(
    companies_list=companies,
    df=prepared_tweets,
    col_name='score',
    n_of_shifts=14,
)

In [None]:
# Show the shifted scores dataframe as the output of this cell.
tweets_shifted