In [17]:
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn import tree
from sklearn.linear_model import LogisticRegression
import yfinance as yf

In [18]:
# Load the CSV data into a DataFrame
file_path = Path("CorporateStockSplits_10yr_TechOnly.csv")
CSV_Split_df = pd.read_csv(file_path)

# had to remove date as index because its not unique and the one hot encoder requires unique index values. 
# , index_col = "Date", infer_datetime_format = True 

# Display sample data
Full_Split_df = CSV_Split_df.dropna()
Full_Split_df

Unnamed: 0,Date,Symbol,Company Name,Type,Split Ratio,Split Number,Industry,Price on day of Split,Price Prior to Split,Date4Q,...,Price4Q,Date3Q,Volume3Q,Price3Q,Date2Q,Volume2Q,Price2Q,Date1Q,Volume1Q,Price1Q
0,4/18/2023,INTC,Intel Corporation,,,1,Technology,31.83,31.83,4/22/2022,...,46.54,7/21/2022,29661198,40.61,10/19/2022,33882274,26.00,1/18/2023,44199580,28.81
1,4/18/2023,MXL,MaxLinear Inc.,,,1,Technology,31.59,31.59,4/22/2022,...,44.14,7/21/2022,693403,40.41,10/19/2022,807336,30.45,1/18/2023,489581,36.77
2,4/18/2023,MRVL,Marvell Technology Group Ltd.,,,1,Technology,42.24,42.24,4/22/2022,...,58.44,7/21/2022,7900197,54.57,10/19/2022,8271283,38.10,1/18/2023,7445092,40.49
3,4/18/2023,STM,STMicroelectronics NV,,,1,Technology,50.59,50.59,4/22/2022,...,37.47,7/21/2022,3288557,35.22,10/19/2022,3486155,31.82,1/18/2023,4139197,42.29
4,4/18/2023,MU,Micron Technology Inc.,,,1,Technology,61.93,61.93,4/22/2022,...,69.41,7/21/2022,12924682,63.64,10/19/2022,14786102,53.03,1/18/2023,10035267,56.51
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58,6/15/2022,SBIG,Springbig Holdings Inc,Forward,2 for 1,2,Technology,4.50,9.00,6/15/2021,...,9.70,9/15/2021,934,9.70,12/17/2021,16316,9.86,3/17/2022,27561,19.86
59,6/23/2022,FTNT,Fortinet Inc,Forward,5 for 1,5,Technology,56.80,284.00,6/23/2021,...,240.05,9/23/2021,977481,310.55,12/27/2021,1019028,367.65,3/25/2022,858572,332.75
60,6/29/2022,SHOP,Shopify Inc,Forward,10 for 1,10,Technology,33.05,330.50,6/29/2021,...,1483.00,9/29/2021,1005928,1346.60,12/31/2021,471261,1377.40,3/31/2022,2222722,676.00
61,9/14/2022,PANW,Palo Alto Networks Inc,Forward,3 for 1,3,Technology,182.06,546.18,9/14/2021,...,484.83,12/15/2021,1569519,540.30,3/18/2022,2167555,577.02,6/16/2022,1367100,466.32


In [19]:
# Have to remove reverse split companies because that is going to throw off our test population.
Split_df = Full_Split_df.loc[Full_Split_df["Type"].isin(['None', 'Forward'])] 

Split_df

Unnamed: 0,Date,Symbol,Company Name,Type,Split Ratio,Split Number,Industry,Price on day of Split,Price Prior to Split,Date4Q,...,Price4Q,Date3Q,Volume3Q,Price3Q,Date2Q,Volume2Q,Price2Q,Date1Q,Volume1Q,Price1Q
0,4/18/2023,INTC,Intel Corporation,,,1,Technology,31.83,31.83,4/22/2022,...,46.54,7/21/2022,29661198,40.61,10/19/2022,33882274,26.00,1/18/2023,44199580,28.81
1,4/18/2023,MXL,MaxLinear Inc.,,,1,Technology,31.59,31.59,4/22/2022,...,44.14,7/21/2022,693403,40.41,10/19/2022,807336,30.45,1/18/2023,489581,36.77
2,4/18/2023,MRVL,Marvell Technology Group Ltd.,,,1,Technology,42.24,42.24,4/22/2022,...,58.44,7/21/2022,7900197,54.57,10/19/2022,8271283,38.10,1/18/2023,7445092,40.49
3,4/18/2023,STM,STMicroelectronics NV,,,1,Technology,50.59,50.59,4/22/2022,...,37.47,7/21/2022,3288557,35.22,10/19/2022,3486155,31.82,1/18/2023,4139197,42.29
4,4/18/2023,MU,Micron Technology Inc.,,,1,Technology,61.93,61.93,4/22/2022,...,69.41,7/21/2022,12924682,63.64,10/19/2022,14786102,53.03,1/18/2023,10035267,56.51
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58,6/15/2022,SBIG,Springbig Holdings Inc,Forward,2 for 1,2,Technology,4.50,9.00,6/15/2021,...,9.70,9/15/2021,934,9.70,12/17/2021,16316,9.86,3/17/2022,27561,19.86
59,6/23/2022,FTNT,Fortinet Inc,Forward,5 for 1,5,Technology,56.80,284.00,6/23/2021,...,240.05,9/23/2021,977481,310.55,12/27/2021,1019028,367.65,3/25/2022,858572,332.75
60,6/29/2022,SHOP,Shopify Inc,Forward,10 for 1,10,Technology,33.05,330.50,6/29/2021,...,1483.00,9/29/2021,1005928,1346.60,12/31/2021,471261,1377.40,3/31/2022,2222722,676.00
61,9/14/2022,PANW,Palo Alto Networks Inc,Forward,3 for 1,3,Technology,182.06,546.18,9/14/2021,...,484.83,12/15/2021,1569519,540.30,3/18/2022,2167555,577.02,6/16/2022,1367100,466.32


In [20]:
enc = OneHotEncoder(sparse=False)

In [21]:
categorical = ["Type"]

In [22]:
encoded_data = enc.fit_transform(Split_df[categorical])

In [23]:
encoded_df = pd.DataFrame(
    encoded_data,
    columns = enc.get_feature_names(categorical)
)

# Display sample data
encoded_df.head()



Unnamed: 0,Type_Forward,Type_None
0,0.0,1.0
1,0.0,1.0
2,0.0,1.0
3,0.0,1.0
4,0.0,1.0


In [24]:
# Add the numerical variables from the original DataFrame to the one-hot encoding DataFrame
Full_df = pd.concat(
    [Split_df[["Price on day of Split", "Price Prior to Split", "Split Number","Volume4Q","Price4Q","Volume3Q","Price3Q","Volume2Q","Price2Q","Volume1Q","Price1Q"]], encoded_df],axis=1)
	# "Date4Q" "Date3Q" "Date2Q" "Date1Q"		

# Display sample data
Full_df.head()

Unnamed: 0,Price on day of Split,Price Prior to Split,Split Number,Volume4Q,Price4Q,Volume3Q,Price3Q,Volume2Q,Price2Q,Volume1Q,Price1Q,Type_Forward,Type_None
0,31.83,31.83,1,28219860,46.54,29661198,40.61,33882274,26.0,44199580,28.81,0.0,1.0
1,31.59,31.59,1,494735,44.14,693403,40.41,807336,30.45,489581,36.77,0.0,1.0
2,42.24,42.24,1,8638103,58.44,7900197,54.57,8271283,38.1,7445092,40.49,0.0,1.0
3,50.59,50.59,1,3460450,37.47,3288557,35.22,3486155,31.82,4139197,42.29,0.0,1.0
4,61.93,61.93,1,18682432,69.41,12924682,63.64,14786102,53.03,10035267,56.51,0.0,1.0


In [25]:
# Creating the features set X
X = Full_df.drop(columns=["Type_Forward","Type_None"], axis=1)

# Display sample data
X

Unnamed: 0,Price on day of Split,Price Prior to Split,Split Number,Volume4Q,Price4Q,Volume3Q,Price3Q,Volume2Q,Price2Q,Volume1Q,Price1Q
0,31.83,31.83,1,28219860,46.54,29661198,40.61,33882274,26.00,44199580,28.81
1,31.59,31.59,1,494735,44.14,693403,40.41,807336,30.45,489581,36.77
2,42.24,42.24,1,8638103,58.44,7900197,54.57,8271283,38.10,7445092,40.49
3,50.59,50.59,1,3460450,37.47,3288557,35.22,3486155,31.82,4139197,42.29
4,61.93,61.93,1,18682432,69.41,12924682,63.64,14786102,53.03,10035267,56.51
...,...,...,...,...,...,...,...,...,...,...,...
58,4.50,9.00,2,8145,9.70,934,9.70,16316,9.86,27561,19.86
59,56.80,284.00,5,781067,240.05,977481,310.55,1019028,367.65,858572,332.75
60,33.05,330.50,10,1058087,1483.00,1005928,1346.60,471261,1377.40,2222722,676.00
61,182.06,546.18,3,2464777,484.83,1569519,540.30,2167555,577.02,1367100,466.32


In [26]:
# Creating the target set y
y = encoded_df["Type_None"]

# Display sample data
y.head()

0    1.0
1    1.0
2    1.0
3    1.0
4    1.0
Name: Type_None, dtype: float64

In [27]:
# Split the preprocessed data into training and testing datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [28]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [29]:
model = tree.DecisionTreeClassifier()

In [30]:
model = model.fit(X_train_scaled, y_train)

In [31]:
predictions = model.predict(X_test_scaled)

In [32]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00        15
         1.0       1.00      1.00      1.00         1

    accuracy                           1.00        16
   macro avg       1.00      1.00      1.00        16
weighted avg       1.00      1.00      1.00        16



In [33]:
# Trying out a logistic Regression Model

In [34]:

classifier = LogisticRegression(solver='lbfgs', random_state=1)
classifier

LogisticRegression(random_state=1)

In [35]:
 LogisticRegression(random_state=1)

LogisticRegression(random_state=1)

In [37]:
classifier.fit(X_train, y_train)

LogisticRegression(random_state=1)

In [38]:
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.5319148936170213
Testing Data Score: 0.3125


In [40]:
predictions_LR = classifier.predict(X_test)
pd.DataFrame({"Prediction": predictions_LR, "Actual": y_test})

Unnamed: 0,Prediction,Actual
24,0.0,0.0
39,1.0,0.0
51,1.0,0.0
27,0.0,0.0
61,1.0,0.0
2,1.0,1.0
21,1.0,0.0
42,0.0,0.0
41,1.0,0.0
44,1.0,0.0


In [41]:
# Display the accuracy score for the Logistic Regression Model.
accuracy_score(y_test, predictions_LR)

0.3125

In [44]:
# As we are able to see with an accuracy score of 31.25% this model isnt the most accurate thing in the world.  
print("Your model is accurate ",accuracy_score(y_test, predictions_LR)*100 , "% of the time every time")

Your model is accurate  31.25 % of the time every time
