## Feature Extraction and Data Preprocessing

In [1]:
import pandas as pd
import matplotlib.pyplot as plt


In [2]:
# Load the merged dataset and preview its first rows.
# Note: `df` is reassigned from "Data_with_content.csv" in a later cell.
df = pd.read_csv("Merged_Data.csv")
df.head(5)

Unnamed: 0.1,Unnamed: 0,Stock,sentiment_score,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,sentiment_score_average
0,0,AAPL,0.104167,2024-11-29,234.809998,237.809998,233.970001,237.330002,28481400.0,0.0,0.0,0.122917
1,1,AAPL,0.141667,2024-11-29,234.809998,237.809998,233.970001,237.330002,28481400.0,0.0,0.0,0.122917
2,2,AAPL,0.104167,2024-11-22,228.059998,230.720001,228.059998,229.869995,38168300.0,0.0,0.0,0.104167
3,3,AAPL,0.104167,2024-11-15,226.399994,226.919998,224.270004,225.0,47923700.0,0.0,0.0,0.104167
4,4,AAPL,0.104167,2024-11-08,227.169998,228.660004,226.410004,226.960007,38328800.0,0.25,0.0,0.113826


In [6]:
# Load the dataset that also carries the raw post text ("Content") and preview it.
df = pd.read_csv("Data_with_content.csv")
df.head(5)

Unnamed: 0.1,Unnamed: 0,Content,Stock,sentiment_score,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,sentiment_score_average
0,0,r/Stocks Daily Discussion & Fundamentals Frida...,AAPL,0.104167,2024-11-29,234.809998,237.809998,233.970001,237.330002,28481400.0,0.0,0.0,0.122917
1,1,Old IRA very overweight in two equities - opin...,AAPL,0.141667,2024-11-29,234.809998,237.809998,233.970001,237.330002,28481400.0,0.0,0.0,0.122917
2,2,r/Stocks Daily Discussion & Fundamentals Frida...,AAPL,0.104167,2024-11-22,228.059998,230.720001,228.059998,229.869995,38168300.0,0.0,0.0,0.104167
3,3,r/Stocks Daily Discussion & Fundamentals Frida...,AAPL,0.104167,2024-11-15,226.399994,226.919998,224.270004,225.0,47923700.0,0.0,0.0,0.104167
4,4,r/Stocks Daily Discussion & Fundamentals Frida...,AAPL,0.104167,2024-11-08,227.169998,228.660004,226.410004,226.960007,38328800.0,0.25,0.0,0.113826


In [7]:
# Distinct values present in the Stock column.
df['Stock'].unique()

array(['AAPL', 'MSFT', 'AMZN', 'TSLA', 'GOOGL'], dtype=object)

In [8]:
# Remove the 'Unnamed: 0' column from df (modifies df in place).
df.drop(columns='Unnamed: 0', inplace=True)

In [9]:
# Preview df after dropping the 'Unnamed: 0' column.
df.head(5)

Unnamed: 0,Content,Stock,sentiment_score,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,sentiment_score_average
0,r/Stocks Daily Discussion & Fundamentals Frida...,AAPL,0.104167,2024-11-29,234.809998,237.809998,233.970001,237.330002,28481400.0,0.0,0.0,0.122917
1,Old IRA very overweight in two equities - opin...,AAPL,0.141667,2024-11-29,234.809998,237.809998,233.970001,237.330002,28481400.0,0.0,0.0,0.122917
2,r/Stocks Daily Discussion & Fundamentals Frida...,AAPL,0.104167,2024-11-22,228.059998,230.720001,228.059998,229.869995,38168300.0,0.0,0.0,0.104167
3,r/Stocks Daily Discussion & Fundamentals Frida...,AAPL,0.104167,2024-11-15,226.399994,226.919998,224.270004,225.0,47923700.0,0.0,0.0,0.104167
4,r/Stocks Daily Discussion & Fundamentals Frida...,AAPL,0.104167,2024-11-08,227.169998,228.660004,226.410004,226.960007,38328800.0,0.25,0.0,0.113826


### Extracting the subjectivity of each post

In [10]:
from textblob import TextBlob


def _subjectivity(content):
    """Return TextBlob's subjectivity score for one post as a plain float (0 to 1)."""
    return TextBlob(content).subjectivity


def extract_sentiment(content):
    """Return the subjectivity of one post wrapped in a one-element Series.

    Only subjectivity is computed here; no polarity value is produced.
    The Series return shape is kept so existing callers of this function
    keep working.

    Parameters
    ----------
    content : str
        Text of a single post.

    Returns
    -------
    pd.Series
        Series with the single key 'subjectivity'.
    """
    return pd.Series({'subjectivity': _subjectivity(content)})


# Map each post directly to its scalar score. This yields a Series that lines
# up with the new column, instead of building one Series per row and assigning
# the resulting one-column DataFrame to a single column.
df['subjectivity'] = df['Content'].map(_subjectivity)


In [11]:
# Preview df with the newly added 'subjectivity' column.
df.head(5)

Unnamed: 0,Content,Stock,sentiment_score,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,sentiment_score_average,subjectivity
0,r/Stocks Daily Discussion & Fundamentals Frida...,AAPL,0.104167,2024-11-29,234.809998,237.809998,233.970001,237.330002,28481400.0,0.0,0.0,0.122917,0.225
1,Old IRA very overweight in two equities - opin...,AAPL,0.141667,2024-11-29,234.809998,237.809998,233.970001,237.330002,28481400.0,0.0,0.0,0.122917,0.341667
2,r/Stocks Daily Discussion & Fundamentals Frida...,AAPL,0.104167,2024-11-22,228.059998,230.720001,228.059998,229.869995,38168300.0,0.0,0.0,0.104167,0.225
3,r/Stocks Daily Discussion & Fundamentals Frida...,AAPL,0.104167,2024-11-15,226.399994,226.919998,224.270004,225.0,47923700.0,0.0,0.0,0.104167,0.225
4,r/Stocks Daily Discussion & Fundamentals Frida...,AAPL,0.104167,2024-11-08,227.169998,228.660004,226.410004,226.960007,38328800.0,0.25,0.0,0.113826,0.225


## Topic modelling on posts using an LDA model

In [12]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Bag-of-words counts over the post text: English stop words removed,
# vocabulary capped at 500 terms.
vectorizer = CountVectorizer(stop_words='english', max_features=500)
content_matrix = vectorizer.fit_transform(df['Content'])

# Fit a 5-topic LDA model (seeded for repeatable results) and get the
# per-post topic weights.
lda = LatentDirichletAllocation(random_state=42, n_components=5)
lda_features = lda.fit_transform(content_matrix)

# Store each topic's weights as its own column: topic_0, topic_1, ...
for topic_idx, topic_weights in enumerate(lda_features.T):
    df[f'topic_{topic_idx}'] = topic_weights


In [13]:
# NOTE: plain assignment does not copy. df11 refers to the same DataFrame
# object as df, so in-place changes made through df are visible through df11.
df11=df

In [19]:
# Alias, not a copy: df1 refers to the same DataFrame object as df11.
# df1 is the name the one-hot encoding and date-feature cells work on.
df1=df11

In [20]:
# Alias, not a copy: df2 refers to the same DataFrame object as df11.
# NOTE(review): df2 is not used again in the visible part of this notebook.
df2=df11

In [21]:
# Alias, not a copy: df3 refers to the same DataFrame object as df11.
# NOTE(review): df3 is not used again in the visible part of this notebook.
df3=df11

### Five topics are identified and their per-post weights are added to the dataframe

In [14]:
# Preview df with the topic_0 ... topic_4 columns.
df.head(5)

Unnamed: 0,Content,Stock,sentiment_score,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,sentiment_score_average,subjectivity,topic_0,topic_1,topic_2,topic_3,topic_4
0,r/Stocks Daily Discussion & Fundamentals Frida...,AAPL,0.104167,2024-11-29,234.809998,237.809998,233.970001,237.330002,28481400.0,0.0,0.0,0.122917,0.225,0.969128,0.007718,0.007754,0.007698,0.007703
1,Old IRA very overweight in two equities - opin...,AAPL,0.141667,2024-11-29,234.809998,237.809998,233.970001,237.330002,28481400.0,0.0,0.0,0.122917,0.341667,0.0799,0.013368,0.471801,0.138068,0.296863
2,r/Stocks Daily Discussion & Fundamentals Frida...,AAPL,0.104167,2024-11-22,228.059998,230.720001,228.059998,229.869995,38168300.0,0.0,0.0,0.104167,0.225,0.969128,0.007716,0.007752,0.007698,0.007706
3,r/Stocks Daily Discussion & Fundamentals Frida...,AAPL,0.104167,2024-11-15,226.399994,226.919998,224.270004,225.0,47923700.0,0.0,0.0,0.104167,0.225,0.969109,0.007715,0.007753,0.007703,0.00772
4,r/Stocks Daily Discussion & Fundamentals Frida...,AAPL,0.104167,2024-11-08,227.169998,228.660004,226.410004,226.960007,38328800.0,0.25,0.0,0.113826,0.225,0.967903,0.008018,0.008061,0.008006,0.008011


In [15]:
# Remove the raw post text now that features have been derived from it.
# This is done in place, so every name bound to this same DataFrame object
# (df11, df1, df2, df3 are plain aliases of df) loses the column as well.
df.drop(columns='Content', inplace=True)

In [16]:
# Preview df after the 'Content' column has been removed.
df.head(5)

Unnamed: 0,Stock,sentiment_score,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,sentiment_score_average,subjectivity,topic_0,topic_1,topic_2,topic_3,topic_4
0,AAPL,0.104167,2024-11-29,234.809998,237.809998,233.970001,237.330002,28481400.0,0.0,0.0,0.122917,0.225,0.969128,0.007718,0.007754,0.007698,0.007703
1,AAPL,0.141667,2024-11-29,234.809998,237.809998,233.970001,237.330002,28481400.0,0.0,0.0,0.122917,0.341667,0.0799,0.013368,0.471801,0.138068,0.296863
2,AAPL,0.104167,2024-11-22,228.059998,230.720001,228.059998,229.869995,38168300.0,0.0,0.0,0.104167,0.225,0.969128,0.007716,0.007752,0.007698,0.007706
3,AAPL,0.104167,2024-11-15,226.399994,226.919998,224.270004,225.0,47923700.0,0.0,0.0,0.104167,0.225,0.969109,0.007715,0.007753,0.007703,0.00772
4,AAPL,0.104167,2024-11-08,227.169998,228.660004,226.410004,226.960007,38328800.0,0.25,0.0,0.113826,0.225,0.967903,0.008018,0.008061,0.008006,0.008011


### One-hot encoding the Stock column

In [22]:
# One-hot encode the Stock column into Stock_<ticker> indicator columns.
# The dtype is pinned because the default differs between pandas versions
# (uint8 before 2.0, bool from 2.0 on). uint8 keeps the 1/0 values shown in
# this notebook's outputs and written to the exported CSVs on any version.
df1 = pd.get_dummies(df1, columns=['Stock'], dtype='uint8')


In [24]:
# Inspect the first 60 rows of the one-hot encoded frame.
df1.head(n=60)

Unnamed: 0,sentiment_score,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,sentiment_score_average,...,topic_0,topic_1,topic_2,topic_3,topic_4,Stock_AAPL,Stock_AMZN,Stock_GOOGL,Stock_MSFT,Stock_TSLA
0,0.104167,2024-11-29,234.809998,237.809998,233.970001,237.330002,28481400.0,0.0,0.0,0.122917,...,0.969128,0.007718,0.007754,0.007698,0.007703,1,0,0,0,0
1,0.141667,2024-11-29,234.809998,237.809998,233.970001,237.330002,28481400.0,0.0,0.0,0.122917,...,0.0799,0.013368,0.471801,0.138068,0.296863,1,0,0,0,0
2,0.104167,2024-11-22,228.059998,230.720001,228.059998,229.869995,38168300.0,0.0,0.0,0.104167,...,0.969128,0.007716,0.007752,0.007698,0.007706,1,0,0,0,0
3,0.104167,2024-11-15,226.399994,226.919998,224.270004,225.0,47923700.0,0.0,0.0,0.104167,...,0.969109,0.007715,0.007753,0.007703,0.00772,1,0,0,0,0
4,0.104167,2024-11-08,227.169998,228.660004,226.410004,226.960007,38328800.0,0.25,0.0,0.113826,...,0.967903,0.008018,0.008061,0.008006,0.008011,1,0,0,0,0
5,0.123485,2024-11-08,227.169998,228.660004,226.410004,226.960007,38328800.0,0.25,0.0,0.113826,...,0.012603,0.012624,0.012548,0.012574,0.94965,1,0,0,0,0
6,0.0,2024-11-02,225.559287,227.770589,224.814484,225.886261,45065775.0,0.1875,0.0,0.131754,...,0.016667,0.016771,0.016679,0.932946,0.016937,1,0,0,0,0
7,0.148889,2024-11-02,223.948576,226.881174,223.218965,224.812515,51802750.0,0.125,0.0,0.131754,...,0.007417,0.00762,0.007439,0.9699,0.007624,1,0,0,0,0
8,0.246372,2024-11-02,222.337865,225.991759,221.623445,223.73877,58539725.0,0.0625,0.0,0.131754,...,0.022223,0.185656,0.022389,0.022834,0.746897,1,0,0,0,0
9,0.104167,2024-11-01,220.727153,225.102345,220.027926,222.665024,65276700.0,0.0,0.0,0.104167,...,0.967903,0.008018,0.008061,0.008006,0.008011,1,0,0,0,0


In [25]:
# Save the current state of df1 to CSV, without writing the index.
df1.to_csv("preprocessed_data.csv", index=False)

### Splitting the Date column into numeric features for use in our ML model

In [28]:
# Convert the Date column to datetime so the .dt accessor can be used.
df1['Date'] = pd.to_datetime(df1['Date'])

# New column name -> .dt attribute that supplies its values.
# Weekday is numbered 0 = Monday ... 6 = Sunday.
date_parts = {
    'Year': 'year',
    'Month': 'month',
    'Day': 'day',
    'Weekday': 'weekday',
    'Quarter': 'quarter',
}
for column_name, dt_attribute in date_parts.items():
    df1[column_name] = getattr(df1['Date'].dt, dt_attribute)

# Drop the original Date column once its parts have been extracted.
df1 = df1.drop(columns=['Date'])
print(df1.head())


   sentiment_score        Open        High         Low       Close  \
0         0.104167  234.809998  237.809998  233.970001  237.330002   
1         0.141667  234.809998  237.809998  233.970001  237.330002   
2         0.104167  228.059998  230.720001  228.059998  229.869995   
3         0.104167  226.399994  226.919998  224.270004  225.000000   
4         0.104167  227.169998  228.660004  226.410004  226.960007   

       Volume  Dividends  Stock Splits  sentiment_score_average  subjectivity  \
0  28481400.0       0.00           0.0                 0.122917      0.225000   
1  28481400.0       0.00           0.0                 0.122917      0.341667   
2  38168300.0       0.00           0.0                 0.104167      0.225000   
3  47923700.0       0.00           0.0                 0.104167      0.225000   
4  38328800.0       0.25           0.0                 0.113826      0.225000   

   ...  Stock_AAPL  Stock_AMZN  Stock_GOOGL  Stock_MSFT  Stock_TSLA  Year  \
0  ...         

In [33]:
# Save the final preprocessed frame to CSV, without writing the index.
df1.to_csv("Preprocessed_final_data.csv", index=False)

### Preprocessed Final Data

In [32]:
# Preview the final preprocessed frame.
df1.head(5)

Unnamed: 0,sentiment_score,Open,High,Low,Close,Volume,Dividends,Stock Splits,sentiment_score_average,subjectivity,...,Stock_AAPL,Stock_AMZN,Stock_GOOGL,Stock_MSFT,Stock_TSLA,Year,Month,Day,Weekday,Quarter
0,0.104167,234.809998,237.809998,233.970001,237.330002,28481400.0,0.0,0.0,0.122917,0.225,...,1,0,0,0,0,2024,11,29,4,4
1,0.141667,234.809998,237.809998,233.970001,237.330002,28481400.0,0.0,0.0,0.122917,0.341667,...,1,0,0,0,0,2024,11,29,4,4
2,0.104167,228.059998,230.720001,228.059998,229.869995,38168300.0,0.0,0.0,0.104167,0.225,...,1,0,0,0,0,2024,11,22,4,4
3,0.104167,226.399994,226.919998,224.270004,225.0,47923700.0,0.0,0.0,0.104167,0.225,...,1,0,0,0,0,2024,11,15,4,4
4,0.104167,227.169998,228.660004,226.410004,226.960007,38328800.0,0.25,0.0,0.113826,0.225,...,1,0,0,0,0,2024,11,8,4,4
