In [None]:
# Packages
import yfinance as yf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
import random
import statsmodels.api as sm
from statsmodels.regression.rolling import RollingOLS
import pandas_datareader.data as web
import warnings
import datetime as dt
import pandas_ta
import zipfile
warnings.filterwarnings('ignore')

## Load in Twitter Sentiment Data

In [None]:
# Read the raw Twitter sentiment export (a CSV next to the notebook) into a DataFrame.
sentiment_df = pd.read_csv(filepath_or_buffer='sentiment_data.csv')

In [None]:
# Show the freshly loaded sentiment table as the cell output for a quick visual check.
sentiment_df

In [None]:
# Parse the date column and key every row by (date, symbol).
sentiment_df = (
    sentiment_df
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .set_index(['date', 'symbol'])
)

# Engagement ratio = comments per like for each (date, symbol) observation.
sentiment_df['engagement_ratio'] = sentiment_df['twitterComments'].div(sentiment_df['twitterLikes'])

In [None]:
# Keep only observations with a minimum level of activity:
# strictly more than 20 likes and strictly more than 10 comments.
has_enough_likes = sentiment_df['twitterLikes'] > 20
has_enough_comments = sentiment_df['twitterComments'] > 10
sentiment_df = sentiment_df.loc[has_enough_likes & has_enough_comments]

In [None]:
# Column used to score and rank the symbols.
key_metric = "engagement_ratio"

# Bucket the observations by calendar month (on the date index) and by symbol,
# then average the metric inside every (month, symbol) bucket.
monthly_by_symbol = sentiment_df.reset_index('symbol').groupby([pd.Grouper(freq='M'), 'symbol'])
aggregate_df = monthly_by_symbol[[key_metric]].mean()

# Rank symbols within each month (index level 0); rank 1 is the highest average metric.
aggregate_df['rank'] = aggregate_df.groupby(level=0)[key_metric].rank(ascending=False)

In [None]:
# Show the monthly per-symbol averages together with their within-month rank.
aggregate_df

### Top 5 Stocks by Engagement for each month

In [None]:
# Keep the symbols ranked 5 or better in each month, then move the symbol
# (index level 1) back into a regular column so only the date stays in the index.
top5_df = aggregate_df.loc[aggregate_df['rank'].le(5)].reset_index(level=1)

In [None]:
# Push every date forward by one day and re-key the frame by (date, symbol).
# NOTE(review): the dates come from the monthly grouper, so +1 day presumably lands on
# the first day of the following month (the month in which the picks are held) — confirm.
shifted_df = top5_df.reset_index()
shifted_df['date'] = shifted_df['date'] + pd.DateOffset(1)
top5_df = shifted_df.set_index(['date', 'symbol'])

### Extract Stocks to create Portfolio

In [None]:
# Distinct dates present in the top-5 table.
dates = top5_df.index.get_level_values('date').unique().tolist()

# Map each date (formatted YYYY-MM-DD) to the list of symbols selected for that date.
date_dict = {
    month_start.strftime('%Y-%m-%d'): top5_df.xs(month_start, level=0).index.tolist()
    for month_start in dates
}

date_dict

In [None]:
# Every symbol that made the top-5 list in at least one month.
stocks = top5_df.index.get_level_values('symbol').unique().tolist()

# Download window for the daily price history.
start_date = "2016-01-01"
end_date = "2024-03-01"

# auto_adjust=False is passed explicitly: the next step reads the 'Adj Close' column,
# which yfinance only returns when auto-adjustment is off. Newer yfinance releases
# default to auto_adjust=True, in which case 'Adj Close' is missing and the lookup fails.
prices_df = yf.download(stocks, start=start_date, end=end_date, auto_adjust=False)

In [None]:
# Daily log returns per symbol: log(P_t) - log(P_{t-1}) on the adjusted close.
# Only drop dates where *every* symbol is NaN (e.g. the first row produced by diff()).
# A plain dropna() removes a date as soon as a single symbol has no price, so one
# symbol with a short or missing history would delete that date for all symbols.
returns_df = np.log(prices_df['Adj Close']).diff().dropna(how='all')

In [None]:
# Show the daily log-return table (one column per downloaded symbol).
returns_df

In [None]:
# Build the strategy's daily return series: for each holding month, take the
# equal-weighted (simple mean) daily log return of that month's selected symbols.
monthly_frames = []

# Dedicated loop names are used on purpose so the global start_date / end_date
# (the price download window) are not overwritten by this loop.
for month_start, companies in date_dict.items():
    # MonthEnd(0) rolls the date forward to the last day of its own month.
    month_end = (pd.to_datetime(month_start) + pd.offsets.MonthEnd(0)).strftime('%Y-%m-%d')
    monthly_frames.append(
        returns_df.loc[month_start:month_end, companies]
        .mean(axis=1)
        .to_frame('portfolio-return')
    )

# Concatenate once at the end instead of growing the frame inside the loop;
# fall back to an empty frame when there is nothing to combine.
portfolio_df = pd.concat(monthly_frames, axis=0) if monthly_frames else pd.DataFrame()
portfolio_df

### Compare to QQQ

In [None]:
# Benchmark: buy-and-hold QQQ over the same period.
# auto_adjust=False keeps the 'Adj Close' column, which newer yfinance releases
# drop by default (auto_adjust=True).
qqq_df = yf.download('QQQ', start='2016-01-01', end=dt.date.today(), auto_adjust=False)

# Some yfinance versions return (field, ticker) MultiIndex columns even for a single
# ticker, in which case selecting 'Adj Close' yields a one-column DataFrame.
# Reduce it to a Series so the result always has one flat, well-named column.
qqq_adj_close = qqq_df['Adj Close']
if isinstance(qqq_adj_close, pd.DataFrame):
    qqq_adj_close = qqq_adj_close.iloc[:, 0]

# Daily log returns of the benchmark.
qqq_returns_df = np.log(qqq_adj_close).diff().dropna().to_frame('QQQ Buy&Hold')

# Inner join on the date index: keep only days present in both series.
comp_df = portfolio_df.merge(qqq_returns_df, left_index=True, right_index=True)

In [None]:
# Show the strategy and benchmark daily returns side by side.
comp_df

In [None]:
# Both columns of comp_df are daily *log* returns (built with np.log(...).diff()),
# so they compound by plain summation: cumulative return = exp(sum of log returns) - 1.
# Applying log1p first would treat them as simple returns and understate the growth.
cum_returns_df = np.expm1(comp_df.cumsum())

# Plot cumulative performance up to the start of 2024.
sns.lineplot(data=cum_returns_df[:'2024-01-01'])
plt.title('Twitter engagement strategy vs. QQQ buy & hold')
plt.ylabel('Cumulative return')
plt.xticks(rotation=45);