In [1]:
import streamlit as st
import pandas as pd
from sklearn.linear_model import LinearRegression
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from category_encoders import OneHotEncoder
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
import string
import preprocess

In [2]:
def show_distribution(var, name):
    """The functions helps to print out statistical analysis about data and then plots a distribution plot for the data 
    - inputs: It takes in the column to be analysed
    - Returns statistical analyses and visuals for the data"""
    
    sns.set()
    
    #collating the statistical property
    min_val = var.min()
    max_val = var.max()
    mode_val = var.mode()[0]
    median_val = var.median()
    mean_val = var.mean()
    rstd = var.std()
    rvar= np.sqrt(rstd)
    
    print("The statistical values are as follows:\n Minimum value:{:.2f}\n Maximum value: {:.2f}\n Mode: {:.2f} \n Mean: {:.2f} \n Standard Deviation: {:.2f} \n Variance: {:.2f} \n Median: {:.2f}"\
          .format(min_val, max_val, mode_val, mean_val, rstd, rvar, median_val))
    
    #creating a figure with 2 rows and 1 columns where the upperpart is dominated by the histplot and the
    #bottom by the 
    fig, ax = plt.subplots(2,1, figsize=(12,6))
    
    #Creating the histogram
    sns.histplot(var, ax=ax[0])
    ax[0].set_ylabel("Frequency")
    
    #fitting the statiscal lines
    ax[0].axvline(x=min_val, color='gray', linestyle='dashed', linewidth=2)
    ax[0].axvline(x=median_val, color='cyan', linestyle='dashed', linewidth=2)
    ax[0].axvline(x=mode_val, color='red', linestyle='dashed', linewidth=2)
    ax[0].axvline(x=mean_val, color='orange', linestyle='dashed', linewidth=2)
    ax[0].axvline(x=rvar, color='white', linestyle='dashed', linewidth=2)
    ax[0].axvline(x=max_val, color='gray', linestyle='dashed', linewidth=2)
    
    #creating the boxplot
    sns.boxplot(x=var, ax=ax[1])
    ax[1].axvline(x=min_val, color='gray', linestyle='dashed', linewidth=2, label="Minimum Value")
    ax[1].axvline(x=median_val, color='cyan', linestyle='dashed', linewidth=2, label="Median")
    ax[1].axvline(x=mode_val, color='red', linestyle='dashed', linewidth=2, label="Mode")
    ax[1].axvline(x=mean_val, color='orange', linestyle='dashed', linewidth=2, label="Mean")
    ax[1].axvline(x=rvar, color='white', linestyle='dashed', linewidth=2, label="Variance")
    ax[1].axvline(x=max_val, color='gray', linestyle='dashed', linewidth=2, label="Standard Deviation")
    
    fig.suptitle(f"Data Distribution for {name}", color="brown")
    plt.legend()
    
    plt.show()
    

In [3]:
def analyser(dt):
    """function creates a new column in the dataframe and helps to calculate polarity
    Input - Takes in the raw data
    Output - takes the output data
    """
    df = dt.copy()
    analyzer = SentimentIntensityAnalyzer()
    df['sentiment'] = df['articles'].apply(lambda x: analyzer.polarity_scores(x)['compound'])
    return df

In [4]:
def build_model2(dt, model):
    "Build the model for TESLA stock Price Prediction"
    df = dt.copy()
    # Splitting the dataset
    X = df[["sentiment"]]
    y = df["mean"]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12)
    
    if model == "rg":
        # Building the model
        model = make_pipeline(
            SimpleImputer(),
            Ridge()
        )
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)
        intercept = model.named_steps["ridge"].intercept_
        coef = model.named_steps["ridge"].coef_
        
        # Calculate accuracy
        accuracy = model.score(X_test, y_test)
        
        return model, rmse, intercept, coef, accuracy
    
    elif model == "lr":
        # Building the model
        model = make_pipeline(
            SimpleImputer(),
            LinearRegression(fit_intercept=True)
        )
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)
        intercept = model.named_steps["linearregression"].intercept_
        coef = model.named_steps["linearregression"].coef_
        
        # Calculate accuracy
        accuracy = model.score(X_test, y_test)
        
        return model, rmse, intercept, coef, accuracy

In [5]:
def make_prediction(article, model):
    # Create a new column for sentiment analysis
    df = pd.DataFrame({"article": article}, index=[0])
    analyzer = SentimentIntensityAnalyzer()
    df["sentiment"] = df["article"].apply(lambda x: analyzer.polarity_scores(x)['compound'])
    data = df["sentiment"]
    data = pd.DataFrame(data, index=[0])
    prediction = model.predict(data)[0].round(2)
    return f"Predicted Stock price: ${prediction}"

In [6]:
def build_model2(dt):
    "build the model"
    df = dt.copy()
    
    #splitting the dataset
    X = df[["sentiment"]]
    y = df["mean"]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12)
    
    #building the model
    model = make_pipeline(
        SimpleImputer(),
        Ridge()
    )
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    rmse= np.sqrt(mse)
    
    intercept = model.named_steps["ridge"].intercept_
    coef = model.named_steps["ridge"].coef_
    
    return model, rmse, intercept, coef


In [7]:
!pip install vaderSentiment



In [8]:
import pandas as pd


df2 = pd.read_csv('stock_tweets.csv')
df2

Unnamed: 0,Date,Tweet,Stock Name,Company Name
0,2022-09-29 23:41:16+00:00,Mainstream media has done an amazing job at br...,TSLA,"Tesla, Inc."
1,2022-09-29 23:24:43+00:00,Tesla delivery estimates are at around 364k fr...,TSLA,"Tesla, Inc."
2,2022-09-29 23:18:08+00:00,3/ Even if I include 63.0M unvested RSUs as of...,TSLA,"Tesla, Inc."
3,2022-09-29 22:40:07+00:00,@RealDanODowd @WholeMarsBlog @Tesla Hahaha why...,TSLA,"Tesla, Inc."
4,2022-09-29 22:27:05+00:00,"@RealDanODowd @Tesla Stop trying to kill kids,...",TSLA,"Tesla, Inc."
...,...,...,...,...
80788,2021-10-07 17:11:57+00:00,Some of the fastest growing tech stocks on the...,XPEV,XPeng Inc.
80789,2021-10-04 17:05:59+00:00,"With earnings on the horizon, here is a quick ...",XPEV,XPeng Inc.
80790,2021-10-01 04:43:41+00:00,Our record delivery results are a testimony of...,XPEV,XPeng Inc.
80791,2021-10-01 00:03:32+00:00,"We delivered 10,412 Smart EVs in Sep 2021, rea...",XPEV,XPeng Inc.


In [9]:
df2 = df2[df2["Stock Name"] == "TSLA"]

In [10]:
df2.head(100)

Unnamed: 0,Date,Tweet,Stock Name,Company Name
0,2022-09-29 23:41:16+00:00,Mainstream media has done an amazing job at br...,TSLA,"Tesla, Inc."
1,2022-09-29 23:24:43+00:00,Tesla delivery estimates are at around 364k fr...,TSLA,"Tesla, Inc."
2,2022-09-29 23:18:08+00:00,3/ Even if I include 63.0M unvested RSUs as of...,TSLA,"Tesla, Inc."
3,2022-09-29 22:40:07+00:00,@RealDanODowd @WholeMarsBlog @Tesla Hahaha why...,TSLA,"Tesla, Inc."
4,2022-09-29 22:27:05+00:00,"@RealDanODowd @Tesla Stop trying to kill kids,...",TSLA,"Tesla, Inc."
...,...,...,...,...
95,2022-09-29 05:51:42+00:00,Next year the city streets are going to be ver...,TSLA,"Tesla, Inc."
96,2022-09-29 05:38:00+00:00,Tesla Full Self-Driving Beta is trained on dat...,TSLA,"Tesla, Inc."
97,2022-09-29 05:19:26+00:00,“The code you will write will at term run in m...,TSLA,"Tesla, Inc."
98,2022-09-29 04:46:00+00:00,FSD SAVES LIVES. $TSLA \n\nhttps://t.co/4p8wnl...,TSLA,"Tesla, Inc."


In [11]:
df2['Date'] = pd.to_datetime(df2['Date'])

In [12]:
# Convert the date column to datetime format
df2['Date'] = pd.to_datetime(df2['Date'])

# Group the data by day and concatenate the tweets in a row separated by commas
grouped_data = df2.groupby(pd.Grouper(key='Date', freq='D'))["Tweet"].apply(lambda x: ', '.join(x)).reset_index()

In [13]:
grouped_data["Tweet"][3]



In [14]:
grouped_data

Unnamed: 0,Date,Tweet
0,2021-09-30 00:00:00+00:00,"In other words, AMD has been giving Tesla pref..."
1,2021-10-01 00:00:00+00:00,Pelosi still scrambling to find enough votes t...
2,2021-10-02 00:00:00+00:00,When the fuzz has a car that can keep up with ...
3,2021-10-03 00:00:00+00:00,"If you really want to be successful in life, f..."
4,2021-10-04 00:00:00+00:00,STOCKS I AM WATCHING THIS WEEK \n\n$CEI \n$AMC...
...,...,...
360,2022-09-25 00:00:00+00:00,"When I drive on FSD Beta, I know Tesla full se..."
361,2022-09-26 00:00:00+00:00,$TSLA - Above 273 - Trade Idea 💡 - Sept 30 280...
362,2022-09-27 00:00:00+00:00,4 years ago today I picked up my dream car. I ...
363,2022-09-28 00:00:00+00:00,⚡️Watchlist for 9/27/22⚡️\n\n✅ $SPY \n📈C &gt; ...


In [15]:
import yfinance as yf

# Set the date range for which to retrieve stock price data
start_date = '2021-09-30'
end_date = '2022-09-29'

# Get the Tesla stock data for the specified date range
tsla = yf.download('TSLA', start=start_date, end=end_date)

# Print the head of the DataFrame to verify it was loaded correctly
print(tsla.head())

[*********************100%***********************]  1 of 1 completed
                  Open        High         Low       Close   Adj Close  \
Date                                                                     
2021-09-30  260.333344  263.043335  258.333344  258.493347  258.493347   
2021-10-01  259.466675  260.260010  254.529999  258.406677  258.406677   
2021-10-04  265.500000  268.989990  258.706665  260.510010  260.510010   
2021-10-05  261.600006  265.769989  258.066681  260.196655  260.196655   
2021-10-06  258.733337  262.220001  257.739990  260.916656  260.916656   

              Volume  
Date                  
2021-09-30  53868000  
2021-10-01  51094200  
2021-10-04  91449900  
2021-10-05  55297800  
2021-10-06  43898400  


In [16]:
tsla.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 251 entries, 2021-09-30 to 2022-09-28
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Open       251 non-null    float64
 1   High       251 non-null    float64
 2   Low        251 non-null    float64
 3   Close      251 non-null    float64
 4   Adj Close  251 non-null    float64
 5   Volume     251 non-null    int64  
dtypes: float64(5), int64(1)
memory usage: 13.7 KB


In [17]:
grouped_data["Date"] = pd.to_datetime(grouped_data["Date"]).dt.strftime('%Y-%m-%d')
grouped_data["Date"] = pd.to_datetime(grouped_data['Date'])

In [18]:
grouped_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 365 entries, 0 to 364
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   Date    365 non-null    datetime64[ns]
 1   Tweet   365 non-null    object        
dtypes: datetime64[ns](1), object(1)
memory usage: 5.8+ KB


In [19]:
dfx = pd.merge(grouped_data, tsla, on="Date")

In [20]:
dfx


Unnamed: 0,Date,Tweet,Open,High,Low,Close,Adj Close,Volume
0,2021-09-30,"In other words, AMD has been giving Tesla pref...",260.333344,263.043335,258.333344,258.493347,258.493347,53868000
1,2021-10-01,Pelosi still scrambling to find enough votes t...,259.466675,260.260010,254.529999,258.406677,258.406677,51094200
2,2021-10-04,STOCKS I AM WATCHING THIS WEEK \n\n$CEI \n$AMC...,265.500000,268.989990,258.706665,260.510010,260.510010,91449900
3,2021-10-05,Most Mentioned Tickers on WSB (via Swaggy Stoc...,261.600006,265.769989,258.066681,260.196655,260.196655,55297800
4,2021-10-06,If you invested $1K into these #assets at the ...,258.733337,262.220001,257.739990,260.916656,260.916656,43898400
...,...,...,...,...,...,...,...,...
246,2022-09-22,@Teslaconomics Hold tight $TSLA investors. Our...,299.859985,301.290009,285.820007,288.589996,288.589996,70545400
247,2022-09-23,3. Scanning for Option Trades\n\nPICK 3-5 Stoc...,283.089996,284.500000,272.820007,275.329987,275.329987,63748400
248,2022-09-26,$TSLA - Above 273 - Trade Idea 💡 - Sept 30 280...,271.829987,284.089996,270.309998,276.010010,276.010010,58076900
249,2022-09-27,4 years ago today I picked up my dream car. I ...,283.839996,288.670013,277.510010,282.940002,282.940002,61925200


In [21]:
# Create a new column for sentiment analysis for TESLA stocks
def analyser2(dt):
    """function creates a new column in the dataframe and helps to calculate polarity
    Input - Takes in the raw data
    Output - takes the output data
    """
    df = dt.copy()
    analyzer = SentimentIntensityAnalyzer()
    df['sentiment'] = df['Tweet'].apply(lambda x: analyzer.polarity_scores(x)['compound'])
    return df

dfx = analyser2(dfx)

In [22]:
dfx


Unnamed: 0,Date,Tweet,Open,High,Low,Close,Adj Close,Volume,sentiment
0,2021-09-30,"In other words, AMD has been giving Tesla pref...",260.333344,263.043335,258.333344,258.493347,258.493347,53868000,0.9998
1,2021-10-01,Pelosi still scrambling to find enough votes t...,259.466675,260.260010,254.529999,258.406677,258.406677,51094200,0.9998
2,2021-10-04,STOCKS I AM WATCHING THIS WEEK \n\n$CEI \n$AMC...,265.500000,268.989990,258.706665,260.510010,260.510010,91449900,0.9993
3,2021-10-05,Most Mentioned Tickers on WSB (via Swaggy Stoc...,261.600006,265.769989,258.066681,260.196655,260.196655,55297800,0.9992
4,2021-10-06,If you invested $1K into these #assets at the ...,258.733337,262.220001,257.739990,260.916656,260.916656,43898400,0.9998
...,...,...,...,...,...,...,...,...,...
246,2022-09-22,@Teslaconomics Hold tight $TSLA investors. Our...,299.859985,301.290009,285.820007,288.589996,288.589996,70545400,0.9972
247,2022-09-23,3. Scanning for Option Trades\n\nPICK 3-5 Stoc...,283.089996,284.500000,272.820007,275.329987,275.329987,63748400,0.9994
248,2022-09-26,$TSLA - Above 273 - Trade Idea 💡 - Sept 30 280...,271.829987,284.089996,270.309998,276.010010,276.010010,58076900,0.9991
249,2022-09-27,4 years ago today I picked up my dream car. I ...,283.839996,288.670013,277.510010,282.940002,282.940002,61925200,0.9994


In [23]:

dfx["mean"] = (dfx["Open"] + dfx["High"] + dfx["Low"] + dfx["Close"] + dfx["Adj Close"])/5
model, rmse, intercept, coef = build_model2(dfx)

In [24]:
print(f"The equation can be expressed as: Price = {intercept} * sentiment + {coef}")
print(f"The root mean square error is {rmse}")

The equation can be expressed as: Price = 289.4461750534144 * sentiment + [10.68013342]
The root mean square error is 40.39006354436193


In [25]:
#making prediction on a sample string
make_prediction(". A Status Quo Secretary General. Best Buy and Circuit City Report Brisk Sales for December. \
China Currency Near Parity With Hong Kong’s Dollar. Ex-Merrill Lynch Analyst Sentenced for Insider Trading. \
Companies Pay Dearly for Tech Trade Show. France: Inmate Suspected of Cannibalism. Mr. Ford Gets the Last Laugh. \
San Franciscans Hurl Their Rage at Parking Patrol. Abroad at Home. Monkey on a Tiger. Who Is Best at R&D?.\
Tanzanian Woman Is Chosen for U.N.’s 2nd Highest Post. Leaked Notes on &#8217;08 Show Giuliani Without the Spin.\
Images of Hanging Make Hussein a Martyr to Many. Just Days Into the Year, Killings Toll Hits 8 in New Orleans. \
Regulators Issue Rules on High-Risk Dealings. Iraqi Premier Announces Militia Crackdown.\
Chirac Says Time Proves France Was Right to Resist War. Verizon to Announce Plan for TV Shows on Cellphones. \
Goal Is to Look Good, Writer of Democratic Memoir Admits. Schwarzenegger Opens in a Sequel. San Francisco to Go Wireless.\
Tijuana Police Suspend Patrols After Soldiers Seize Their Guns. Safe as Milk?.\
Editor’s E-Mail May Be Used in Suit Against The Times. A Renegade’s Tale of His Scorn for Japan’s ‘Club of Old Men’. \
China Moves to Tighten the Money Supply. Google Makes Another Investment in the Internet in China.\
Colorado, Still Recovering From 2 Storms, Is Hit With a 3rd. How I Spent My Summer Vacation: Now a Major Motionless Book. \
Surge in Jobs Sends Markets Lower. Katrina Victims Find a Solution: Modular House. A Meeting With Gerald Ford. \
Job Growth Is Strong, Surprising Economists. Encouraging More Reality in Economics. U.N. Expels 6 in Sex Case.\
Germany: Sentence Trial for 9/11 Friend. Britain: New Restaurant on Radiation Trail. Iran Should Give Sanctions &#8216;Serious Response,&#8217; China Says.\
The Senate’s Task on Warming. Ryan Clark Crocker, a Diplomat Used to Danger. Search Continues for Survivors of Indonesian Ferry Accident. The Vanishing Man of the Forest. \
At Armenian Churches, a Distinct Observance Today. Making No Small Plans.\
American Diplomat to Visit Strife-Torn Somali Capital. Zalmay Khalilzad, on to a New Trouble Spot. Ties to Communist Secret Police Snare Polish Bishop.\
Copyrights and Congress. Monument to the Planet Suffers a Hard Fall to Earth. Health Guidelines Suggested for Models. \
France: Paris Driver Opens Fire; No Injuries. Colombia: 6-Year Hostage Flees Rebels. Sri Lanka: Bomb Attack on Bus Kills 5.\
China Media Battle Hints at Shift on Intellectual Property. Highly Paid Chief Is Paid $210 Million to Go Away.\
The Land of Rising Conservation. Viacom Selects President for MTV International Unit.\
Bush Facing Deep Divide Over More Troops for Iraq. At Home Depot, the New Chief Needs to Sell an Old Story to Wall Street. \
Germany: What’s in It for Gelnhausen?. A Future With the Wind. A New Commander, in Step With the White House on Iraq. \
Tipping Over a Defense of Enron. Appeals Court Reverses Convictions of 2 Westar Energy Executives.\
Day After Killings, a Hamas Leader Calls for a Truce. Dow&#8217;s Dogs Still Having Their Day. U.S. and Japan Issue Warnings to North Korea on Nuclear Test.\
Britain: Cartoons Protester Convicted. Chile: California Sailor Rescued. Canada: Liberal Lawmaker Defects. Battlegrounds: An Op-Ed Series. Sunday's Breakfast Menu: Jan. 7.\
Who Will Bring the Internet to Your TV?. Fun Facts. Tesla Chargers. The Macworld-C.E.S. Conflict. In da Car at Dakar", model)

'Predicted Stock price: $279.14'

In [26]:
import joblib
joblib.dump(model, "Stock_prices_model")

['Stock_prices_model']