In [2]:
import math
import os
import subprocess
import sys

import googletrans
import langdetect
import numpy as np
import pandas as pd
import quandl
import tweepy

import Exporter

In [3]:
# Folder that receives the downloaded CSV files: a 'data' directory located
# next to (i.e. in the parent of) the current working directory.
data_path = os.path.join(os.path.dirname(os.getcwd()), 'data')
# exist_ok=True replaces the separate exists()/mkdir() pair, so the cell can be
# re-run safely and there is no window between the check and the creation.
os.makedirs(data_path, exist_ok=True)

In [4]:
def get_stock_market_data(ticker: str, start_date: str, end_date: str, api_key: str = None) -> pd.DataFrame:
    """
        Download the end-of-day price table for one ticker from Quandl's "EOD" database.

        @param: ticker: the ticker symbol
        @param: start_date: format (yyyy-mm-dd) - the date from which the stock prices need to be pulled
        @param: end_date: format (yyyy-mm-dd) - the date up to which the stock prices need to be pulled
        @param: api_key: optional Quandl API key. When omitted, the QUANDL_API_KEY environment
                variable is used, then any key already set on quandl.ApiConfig.
        @return: dataframe which contains the stock prices of the given ticker.
        @raise: RuntimeError when no API key can be found in any of the places above.
    """
    # NOTE(review): an API key used to be hardcoded here. Credentials must not live in
    # the notebook; the previously committed key should be treated as leaked and revoked.
    resolved_key = api_key or os.environ.get('QUANDL_API_KEY') or quandl.ApiConfig.api_key
    if not resolved_key:
        raise RuntimeError(
            'No Quandl API key found: pass api_key=... or set the '
            'QUANDL_API_KEY environment variable.'
        )
    quandl.ApiConfig.api_key = resolved_key

    # Dataset code is "EOD/<ticker>"; the date window is forwarded unchanged.
    dataset_code = 'EOD/{}'.format(ticker)
    return quandl.get(dataset_code, start_date=start_date, end_date=end_date)

In [5]:
def get_twitter_data(search_term: str, start_date: str, end_date: str) -> pd.DataFrame:
    """
        Run the Exporter.py script found in the current working directory to collect
        the top tweets for a hashtag, then load its CSV output.

        @param: search_term: the term to search for top tweets on twitter (searched as "#<search_term>").
        @param: start_date: the start date from which the top tweets are to be searched and fetched.
        @param: end_date: the end date up to which the top tweets are to be searched and fetched.
        @return: dataframe which contains the top tweets; malformed CSV rows are skipped with a warning.
        @raise: subprocess.CalledProcessError when Exporter.py exits with a non-zero status.
    """
    output_file = 'output_got.csv'

    # An argument list (no shell) means search_term cannot inject shell syntax, and
    # sys.executable runs the script with the same interpreter as this kernel.
    command = [
        sys.executable, 'Exporter.py',
        '--querysearch', '#{}'.format(search_term),
        '--since', str(start_date),
        '--until', str(end_date),
        '--toptweets',
        '--output', output_file,
    ]

    try:
        subprocess.run(command, check=True)
        try:
            # pandas >= 1.3 spelling; 'warn' reports each malformed row and skips it.
            twitter_data = pd.read_csv(output_file, sep=';', on_bad_lines='warn')
        except TypeError:
            # Older pandas does not know on_bad_lines; fall back to the legacy flag.
            twitter_data = pd.read_csv(output_file, sep=';', error_bad_lines=False)
    finally:
        # Always remove the temporary export, even when the run or the parse failed,
        # so a later call can never pick up a stale file.
        if os.path.exists(output_file):
            os.remove(output_file)

    return twitter_data

In [6]:
def get_only_english_tweets(input_df: pd.DataFrame) -> pd.DataFrame:
    """
        Keep only the rows whose 'text' column is detected as English by langdetect.

        @param: input_df: input dataframe, which contains the top tweets downloaded in various
                languages; it must have a 'text' column.
        @return: dataframe which contains only the english tweets (original index preserved).
                 Rows whose text is not a string, or for which detection fails, are dropped.
    """
    # langdetect's detection is randomised unless the seed is pinned; fixing it makes
    # the filter return the same rows on every run.
    langdetect.DetectorFactory.seed = 0

    def is_english(value) -> bool:
        # Missing tweets arrive as NaN (a float); they cannot be English text.
        if not isinstance(value, str):
            return False
        try:
            return langdetect.detect(value) == 'en'
        except langdetect.LangDetectException:
            # Raised when the text has no usable features (e.g. empty or symbols only).
            return False

    # Mapping over the column (instead of a row-wise DataFrame.apply) always yields a
    # Series, so an empty input produces an empty mask rather than a DataFrame.
    english_mask = input_df['text'].map(is_english).astype(bool)
    return input_df.loc[english_mask]

In [7]:
# --- Run parameters -------------------------------------------------------
ticker_symbol = 'AAPL'                 # passed to get_stock_market_data
twitter_search_query_term = 'Apple'    # passed to get_twitter_data

# Date window shared by the stock and the twitter downloads.
start_date = '2019-03-15'
end_date = '2019-04-15'

# --- Download stock prices ------------------------------------------------
print('Getting Stock Market Data for {}'.format(ticker_symbol))
stock_data = get_stock_market_data(ticker_symbol, start_date, end_date)
print('Received Stock Market data for {}'.format(ticker_symbol))

# --- Download tweets and keep the English ones ----------------------------
print('Getting Twitter Data for search query {}'.format(twitter_search_query_term))
twitter_data = get_twitter_data(twitter_search_query_term, start_date, end_date)
print('Received Twitter Data for search query {}'.format(twitter_search_query_term))

print('Getting only english tweets')
english_twitter_data = get_only_english_tweets(twitter_data)
print('Received only english tweets')

# --- Persist both datasets under data_path --------------------------------
# File names embed the query and the date window so different runs do not collide.
stock_data_file_path = os.path.join(data_path, 'stock_data_{}_{}_{}.csv'.format(ticker_symbol, start_date, end_date))
twitter_data_file_path = os.path.join(data_path, 'twitter_data_english_{}_{}_{}.csv'.format(twitter_search_query_term, start_date, end_date))

# The stock frame is written with its index; the tweet frame is written without it.
stock_data.to_csv(stock_data_file_path)
print('Stock market data saved to {}'.format(stock_data_file_path))

english_twitter_data.to_csv(twitter_data_file_path, index=False)
print('Twitter Data saved to {}'.format(twitter_data_file_path))

Getting Stock Market Data for AAPL
Received Stock Market data for AAPL
Getting Twitter Data for search query Apple


b'Skipping line 2251: expected 10 fields, saw 11\nSkipping line 5195: expected 10 fields, saw 11\nSkipping line 16050: expected 10 fields, saw 11\nSkipping line 17255: expected 10 fields, saw 11\nSkipping line 25915: expected 10 fields, saw 11\nSkipping line 26884: expected 10 fields, saw 11\nSkipping line 30927: expected 10 fields, saw 11\nSkipping line 36808: expected 10 fields, saw 11\nSkipping line 41760: expected 10 fields, saw 11\nSkipping line 46578: expected 10 fields, saw 12\nSkipping line 52871: expected 10 fields, saw 12\nSkipping line 54690: expected 10 fields, saw 11\nSkipping line 54899: expected 10 fields, saw 12\nSkipping line 58030: expected 10 fields, saw 11\nSkipping line 62953: expected 10 fields, saw 12\nSkipping line 64527: expected 10 fields, saw 11\n'
b'Skipping line 65609: expected 10 fields, saw 11\nSkipping line 65620: expected 10 fields, saw 11\nSkipping line 65640: expected 10 fields, saw 11\nSkipping line 65655: expected 10 fields, saw 11\nSkipping line 69

Received Twiiter Data for search query Apple
Getting only english tweets
Received only english tweets
Stock market data saved to C:\Users\Shadow\Desktop\FYP-master\FYP-master\Stock-Price-Prediction-master\data\stock_data_AAPL_2019-03-15_2019-04-15.csv
Twitter Data saved to C:\Users\Shadow\Desktop\FYP-master\FYP-master\Stock-Price-Prediction-master\data\twitter_data_english_Apple_2019-03-15_2019-04-15.csv
