## Mock data taken from Kaggle
#### There are two data files: GME_stock.csv which contains the stock price the other file is also from Kaggle which contains "positive" and "negative" sentiments. I thought this would be a good data to do some practice run.

In [25]:
# Initial imports
import pandas as pd
from sklearn.cluster import KMeans
import plotly.express as px
import hvplot.pandas
import numpy as np

In [26]:
# Load gamestop data
file_path = "Mock_data/GME_stock.csv"
df_GME_price = pd.read_csv(file_path)
df_GME_price.head(10)

Unnamed: 0,date,open_price,high_price,low_price,close_price,volume,adjclose_price
0,2021-01-28,265.0,483.0,112.25,193.600006,58815800.0,193.600006
1,2021-01-27,354.829987,380.0,249.0,347.51001,93396700.0,347.51001
2,2021-01-26,88.559998,150.0,80.199997,147.979996,178588000.0,147.979996
3,2021-01-25,96.730003,159.179993,61.130001,76.790001,177874000.0,76.790001
4,2021-01-22,42.59,76.760002,42.32,65.010002,196784300.0,65.010002
5,2021-01-21,39.23,44.75,37.0,43.029999,57079800.0,43.029999
6,2021-01-20,37.369999,41.189999,36.060001,39.119999,33471800.0,39.119999
7,2021-01-19,41.549999,45.52,36.639999,39.360001,74721900.0,39.360001
8,2021-01-15,38.490002,40.75,34.009998,35.5,46752200.0,35.5
9,2021-01-14,38.09,43.060001,33.049999,39.91,93717400.0,39.91


In [27]:
# Check data types
df_GME_price.dtypes

date               object
open_price        float64
high_price        float64
low_price         float64
close_price       float64
volume            float64
adjclose_price    float64
dtype: object

In [28]:
# Load Reddit data
file_path1 = "Mock_data/GME sentiment analysis reddit.csv"
df_reddit = pd.read_csv(file_path1)
df_reddit.head()

Unnamed: 0.1,Unnamed: 0,title,body,timestamp,subjectivity,polarity,analysis
0,0,NEW SEC FILING FOR GME! CAN SOMEONE LESS RETAR...,,28-01-2021 21:28,0.480303,-0.238826,Negative
1,1,"Not to distract from GME, just thought our AMC...",,28-01-2021 21:26,0.25,0.25,Positive
2,2,Currently Holding AMC and NOK - Is it retarded...,,28-01-2021 21:19,0.6,-0.4,Negative
3,3,GME Premarket ???�?� Musk approved ???????????...,,28-01-2021 21:17,0.0,0.0,Neutral
4,4,"Once you're done with GME - $AG and $SLV, the ...",You guys are champs. GME... who would have tho...,28-01-2021 21:17,0.3,0.0,Neutral


In [29]:
# Check data types
df_reddit.dtypes

Unnamed: 0        int64
title            object
body             object
timestamp        object
subjectivity    float64
polarity        float64
analysis         object
dtype: object

### Preprocess the data

In [30]:
# Remove the timestamp from the date of the reddit data
df_reddit["timestamp"] = pd.to_datetime(df_reddit['timestamp'], dayfirst=True)
df_reddit["new_date_column"] = df_reddit["timestamp"].dt.date
df_reddit.head()

Unnamed: 0.1,Unnamed: 0,title,body,timestamp,subjectivity,polarity,analysis,new_date_column
0,0,NEW SEC FILING FOR GME! CAN SOMEONE LESS RETAR...,,2021-01-28 21:28:00,0.480303,-0.238826,Negative,2021-01-28
1,1,"Not to distract from GME, just thought our AMC...",,2021-01-28 21:26:00,0.25,0.25,Positive,2021-01-28
2,2,Currently Holding AMC and NOK - Is it retarded...,,2021-01-28 21:19:00,0.6,-0.4,Negative,2021-01-28
3,3,GME Premarket ???�?� Musk approved ???????????...,,2021-01-28 21:17:00,0.0,0.0,Neutral,2021-01-28
4,4,"Once you're done with GME - $AG and $SLV, the ...",You guys are champs. GME... who would have tho...,2021-01-28 21:17:00,0.3,0.0,Neutral,2021-01-28


In [31]:
# Drop columns not needed. 
reddit_clean_df = df_reddit.drop(columns=["Unnamed: 0", "title", "body", "timestamp"], axis=True)
reddit_clean_df.head()

Unnamed: 0,subjectivity,polarity,analysis,new_date_column
0,0.480303,-0.238826,Negative,2021-01-28
1,0.25,0.25,Positive,2021-01-28
2,0.6,-0.4,Negative,2021-01-28
3,0.0,0.0,Neutral,2021-01-28
4,0.3,0.0,Neutral,2021-01-28


In [32]:
df_reddit["new_date_column"].value_counts()

2021-01-29    2940
2021-02-06     619
2021-02-05     550
2021-02-03     536
2021-02-02     310
              ... 
2021-04-03       3
2021-02-16       2
2021-04-04       2
2021-02-15       2
2021-04-05       2
Name: new_date_column, Length: 68, dtype: int64

In [33]:
reddit_clean_df["date"] = reddit_clean_df["new_date_column"]
reddit_clean_df1=reddit_clean_df.drop(columns=["new_date_column","analysis"])
reddit_clean_df1.head()

Unnamed: 0,subjectivity,polarity,date
0,0.480303,-0.238826,2021-01-28
1,0.25,0.25,2021-01-28
2,0.6,-0.4,2021-01-28
3,0.0,0.0,2021-01-28
4,0.3,0.0,2021-01-28


In [34]:
reddit_clean_df2 = reddit_clean_df1.groupby('date')['subjectivity','polarity'].sum()
reddit_clean_df2['date']=reddit_clean_df2.index
reddit_clean_df2 = reddit_clean_df2.reset_index(drop=True)
reddit_clean_df2

  """Entry point for launching an IPython kernel.


Unnamed: 0,subjectivity,polarity,date
0,65.905308,8.809233,2021-01-28
1,734.038456,72.789893,2021-01-29
2,79.178550,5.050014,2021-01-30
3,75.969506,9.969900,2021-01-31
4,66.381158,5.184502,2021-02-01
...,...,...,...
63,1.722222,-0.288889,2021-04-01
64,3.094444,0.473394,2021-04-02
65,0.854167,0.715625,2021-04-03
66,0.283333,0.250000,2021-04-04


#### Exporting the Data to Mongo

In [12]:
import pymongo
import json

In [13]:
client = pymongo.MongoClient("mongodb://localhost:27017")

In [35]:
Sentiment_Mongo = reddit_clean_df2.to_dict(orient = "record")



In [36]:
Sentiment_Mongo

[{'subjectivity': 65.90530844199998,
  'polarity': 8.80923299,
  'date': datetime.date(2021, 1, 28)},
 {'subjectivity': 734.0384556980009,
  'polarity': 72.78989312599997,
  'date': datetime.date(2021, 1, 29)},
 {'subjectivity': 79.17855008300002,
  'polarity': 5.050013679000001,
  'date': datetime.date(2021, 1, 30)},
 {'subjectivity': 75.96950636999996,
  'polarity': 9.969899514,
  'date': datetime.date(2021, 1, 31)},
 {'subjectivity': 66.38115764299997,
  'polarity': 5.184502255999998,
  'date': datetime.date(2021, 2, 1)},
 {'subjectivity': 95.6606908390001,
  'polarity': 19.768387311000005,
  'date': datetime.date(2021, 2, 2)},
 {'subjectivity': 150.26341510500006,
  'polarity': 24.998924674000016,
  'date': datetime.date(2021, 2, 3)},
 {'subjectivity': 69.467806173,
  'polarity': 5.920135521999999,
  'date': datetime.date(2021, 2, 4)},
 {'subjectivity': 151.6529247160001,
  'polarity': 15.295492016,
  'date': datetime.date(2021, 2, 5)},
 {'subjectivity': 170.45040314400003,
  'pola

In [16]:
db = client("Project_Gamestop")

TypeError: 'MongoClient' object is not callable

In [None]:
print(db)

In [None]:
db.SentGME_Data.insert_many(Sentiment_Mongo)

In [18]:
import pandas as pd
import yfinance as yf
from yahoofinancials import YahooFinancials

In [47]:
start_time = '2020-12-1'
end_time = '2021-4-30'

# Remember to input Ticker symbol in all caps
GME_df = yf.download('GME', 
                      start_time, 
                      end_time, 
                      progress=False)

GME_df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-12-01,17.110001,17.4,15.76,15.8,15.8,12653900
2020-12-02,15.7,16.68,15.38,16.58,16.58,7883400
2020-12-03,16.48,16.639999,15.87,16.120001,16.120001,6295000
2020-12-04,16.299999,17.290001,16.26,16.9,16.9,8972700
2020-12-07,17.0,17.5,16.219999,16.35,16.35,7386300


In [48]:
GME_df.reset_index(level=0, inplace=True)
GME_df.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2020-12-01,17.110001,17.4,15.76,15.8,15.8,12653900
1,2020-12-02,15.7,16.68,15.38,16.58,16.58,7883400
2,2020-12-03,16.48,16.639999,15.87,16.120001,16.120001,6295000
3,2020-12-04,16.299999,17.290001,16.26,16.9,16.9,8972700
4,2020-12-07,17.0,17.5,16.219999,16.35,16.35,7386300


In [49]:
GME_df["date"] = pd.to_datetime(GME_df['Date'])
GME_df["date"] = GME_df["date"].dt.date
GME_df=GME_df.drop(columns=["Date"])
GME_df.head()

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,date
0,17.110001,17.4,15.76,15.8,15.8,12653900,2020-12-01
1,15.7,16.68,15.38,16.58,16.58,7883400,2020-12-02
2,16.48,16.639999,15.87,16.120001,16.120001,6295000,2020-12-03
3,16.299999,17.290001,16.26,16.9,16.9,8972700,2020-12-04
4,17.0,17.5,16.219999,16.35,16.35,7386300,2020-12-07


In [50]:
GME_Mongo = GME_df.to_dict(orient = "record")
GME_Mongo



[{'Open': 17.110000610351562,
  'High': 17.399999618530273,
  'Low': 15.760000228881836,
  'Close': 15.800000190734863,
  'Adj Close': 15.800000190734863,
  'Volume': 12653900,
  'date': datetime.date(2020, 12, 1)},
 {'Open': 15.699999809265137,
  'High': 16.68000030517578,
  'Low': 15.380000114440918,
  'Close': 16.579999923706055,
  'Adj Close': 16.579999923706055,
  'Volume': 7883400,
  'date': datetime.date(2020, 12, 2)},
 {'Open': 16.479999542236328,
  'High': 16.639999389648438,
  'Low': 15.869999885559082,
  'Close': 16.1200008392334,
  'Adj Close': 16.1200008392334,
  'Volume': 6295000,
  'date': datetime.date(2020, 12, 3)},
 {'Open': 16.299999237060547,
  'High': 17.290000915527344,
  'Low': 16.260000228881836,
  'Close': 16.899999618530273,
  'Adj Close': 16.899999618530273,
  'Volume': 8972700,
  'date': datetime.date(2020, 12, 4)},
 {'Open': 17.0,
  'High': 17.5,
  'Low': 16.219999313354492,
  'Close': 16.350000381469727,
  'Adj Close': 16.350000381469727,
  'Volume': 73863

In [None]:
db.GME_Data.insert_many(GME_Mongo)