# Data Collection

## Collecting historical stock data using yfinance library for the stocks: AAPL, MSFT,AMZN,TSLA and GOOGLE

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import yfinance as yf
import plotly.graph_objects as go

#5 different stocks taken
stonks = ["AAPL", "MSFT", "AMZN", "TSLA", "GOOGL"]
hists = {}


for s in stonks:
    tkr = yf.Ticker(s)
    history = tkr.history(period="2y")
    hists[s] = history


stock_data = []
for stock, data in hists.items():
    data = data.reset_index()  
    data["Stock"] = stock 
    stock_data.append(data)


combined_df = pd.concat(stock_data)

#  DataFrame with historical Stock Data
print(combined_df.head())
combined_df.to_csv("Stocks_Data.csv")


                       Date        Open        High         Low       Close  \
0 2022-11-30 00:00:00-05:00  139.928537  147.172370  139.087391  146.489548   
1 2022-12-01 00:00:00-05:00  146.667651  147.578075  145.084296  146.766602   
2 2022-12-02 00:00:00-05:00  144.441081  146.459845  144.134294  146.271820   
3 2022-12-05 00:00:00-05:00  146.232231  149.349445  144.253044  145.104095   
4 2022-12-06 00:00:00-05:00  145.539503  145.767105  140.443088  141.422791   

      Volume  Dividends  Stock Splits Stock  
0  111380900        0.0           0.0  AAPL  
1   71250400        0.0           0.0  AAPL  
2   65447400        0.0           0.0  AAPL  
3   68826400        0.0           0.0  AAPL  
4   64727200        0.0           0.0  AAPL  


In [2]:
df=combined_df

In [3]:
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,Stock
0,2022-11-30 00:00:00-05:00,139.928537,147.17237,139.087391,146.489548,111380900,0.0,0.0,AAPL
1,2022-12-01 00:00:00-05:00,146.667651,147.578075,145.084296,146.766602,71250400,0.0,0.0,AAPL
2,2022-12-02 00:00:00-05:00,144.441081,146.459845,144.134294,146.27182,65447400,0.0,0.0,AAPL
3,2022-12-05 00:00:00-05:00,146.232231,149.349445,144.253044,145.104095,68826400,0.0,0.0,AAPL
4,2022-12-06 00:00:00-05:00,145.539503,145.767105,140.443088,141.422791,64727200,0.0,0.0,AAPL


In [4]:
df.columns

Index(['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Dividends',
       'Stock Splits', 'Stock'],
      dtype='object')

# Web Scraping

## Using praw package to scrape relevant reddit posts about  desired stocks during the past 2 years 

### Authentication

In [7]:
import praw
import time
import pandas as pd


reddit = praw.Reddit
    client_id="HeJO43c9Pqcnz5wyOEC_pg",
    client_secret="qVyVdtW1EiDUJhGEqv5bDErE6XialQ",
    user_agent="StockScraper by hamsatwin",
    username="hamsatwin",
    password="Captainshield"
)

print("Authenticated:", reddit.user.me())



Authenticated: hamsatwin


In [6]:
! pip install praw

Collecting praw
  Using cached praw-7.8.1-py3-none-any.whl (189 kB)
Collecting prawcore<3,>=2.4
  Using cached prawcore-2.4.0-py3-none-any.whl (17 kB)
Collecting update_checker>=0.18
  Using cached update_checker-0.18.0-py3-none-any.whl (7.0 kB)
Installing collected packages: update_checker, prawcore, praw
Successfully installed praw-7.8.1 prawcore-2.4.0 update_checker-0.18.0


In [8]:

subreddits = ["stocks", "investing"]  # Subreddits to search
keywords = ["AAPL", "MSFT", "AMZN", "TSLA", "GOOGL"] 
two_years_ago = int(time.time()) - (2 * 365 * 24 * 60 * 60) # Chosen Timeframe of past 2 years 
results = []

#  Scraping  Data
print("Starting data collection...")
for keyword in keywords:
    for subreddit_name in subreddits:
        subreddit = reddit.subreddit(subreddit_name)
        
        for post in subreddit.search(keyword, sort="new", time_filter="all", limit=1000):  
            if post.created_utc >= two_years_ago: 
                results.append({
                    "Datetime": time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime(post.created_utc)),
                    "Content": f"{post.title} - {post.selftext[:200]}...",  # Truncated content
                    "Stock": keyword
                })
               
                time.sleep(0.1)

    print(f"Collected data for {keyword} in r/{subreddit_name}.")


if len(results) < 300:
    print(f"Warning: Only {len(results)} rows collected. Consider expanding search criteria.")


df = pd.DataFrame(results)

csv_file = "reddit_stock_posts.csv"
df.to_csv(csv_file, index=False, encoding="utf-8")
print(f"Data saved to {csv_file}.")


print(df.head())  
print(f"Total rows: {len(df)}")


Starting data collection...
Collected data for AAPL in r/investing.
Collected data for MSFT in r/investing.
Collected data for AMZN in r/investing.
Collected data for TSLA in r/investing.
Collected data for GOOGL in r/investing.
Data saved to reddit_stock_posts.csv.
              Datetime                                            Content  \
0  2024-11-29 10:30:09  r/Stocks Daily Discussion & Fundamentals Frida...   
1  2024-11-22 10:30:12  r/Stocks Daily Discussion & Fundamentals Frida...   
2  2024-11-15 10:30:09  r/Stocks Daily Discussion & Fundamentals Frida...   
3  2024-11-08 10:30:10  r/Stocks Daily Discussion & Fundamentals Frida...   
4  2024-11-02 18:20:38  Globalstar and Apple expanded partnership  - D...   

  Stock  
0  AAPL  
1  AAPL  
2  AAPL  
3  AAPL  
4  AAPL  
Total rows: 1856


##  Scrapped 1856 stocks relevant  reddit posts using reddit API.