In [20]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [21]:
import warnings
# Suppress the specific FutureWarning
warnings.filterwarnings("ignore", category=FutureWarning, module='sklearn.ensemble._weight_boosting')


In [22]:
df =pd.read_csv('Featured_Engineering_Data_To_CSV.csv')
df.drop('date',axis=1,inplace=True)
df.set_index('Name',inplace=True)
df.dropna(inplace=True)

In [23]:
all_years = pd.read_csv("all_stocks_5yr.csv")
row_counts = all_years.groupby(all_years.Name).size()
stocks_with_few_rows = row_counts[row_counts < 500].index
stocks_with_few_rows

Index(['APTV', 'BHF', 'BHGE', 'DWDP', 'DXC', 'EVHC', 'FTV', 'HLT', 'UA'], dtype='object', name='Name')

In [24]:
df = df[~df.index.isin(stocks_with_few_rows)].copy()


In [25]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

models = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'AdaBoost': AdaBoostClassifier(n_estimators=100),
    'K-Nearest Neighbors': KNeighborsClassifier(n_neighbors=5),
    'Linear Discriminant Analysis' : LinearDiscriminantAnalysis()

}

In [26]:
df["Label2"] = df.groupby("Name").Pct_Change.shift(-1)
df["Label"] = df["Label2"] > 0
df.drop(columns=['Label2'], inplace=True)
df.head(3)

Unnamed: 0_level_0,open,high,low,close,volume,RSI,Pct_Change,MA20,MA50,MA100,EMA_6,EMA_12,EMA_26,High_to_Close,Low_to_Close,Label
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
A,43.5,43.89,42.95,43.09,2986405,45.041816,-1.147052,43.694,43.871,43.1518,43.062438,43.250076,43.670602,0.8,-0.14,True
A,42.82,43.46,42.72,43.17,1940832,41.624365,0.185658,43.626,43.8822,43.1327,43.09317,43.237757,43.633521,0.29,-0.45,True
A,43.46,44.295,43.4,44.23,2100873,51.360947,2.455409,43.587,43.9098,43.129,43.417978,43.39041,43.677704,0.065,-0.83,True


In [27]:
print(df.columns)
df.columns[:-1]

Index(['open', 'high', 'low', 'close', 'volume', 'RSI', 'Pct_Change', 'MA20',
       'MA50', 'MA100', 'EMA_6', 'EMA_12', 'EMA_26', 'High_to_Close',
       'Low_to_Close', 'Label'],
      dtype='object')


Index(['open', 'high', 'low', 'close', 'volume', 'RSI', 'Pct_Change', 'MA20',
       'MA50', 'MA100', 'EMA_6', 'EMA_12', 'EMA_26', 'High_to_Close',
       'Low_to_Close'],
      dtype='object')

In [28]:
stocks = df.index.unique().tolist()
all_accuracies = {}

################################# PCA #####################################

In [29]:
# הגדרת המשתנה איקס להיות כל הפיטצ'רים חוץ מהלייבל שלי
X = df[df.columns[:-1]].values  

# הופך את כל הפיטצ'רים להיות בטווח שבין 0 ל1 וזה מאוד חשוב אצלנו כי כל המשחק השלנו הוא בין 0 ל1
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# מפעיל את PCA
pca = PCA(0.95)  
# קבלת המידע החדש שהמרנו
X_pca = pca.fit_transform(X_scaled)

# יצירת הדאטה פריים שלנו שמורכב מהוקטורים החדשים שלנו
df_pca = pd.DataFrame(data=X_pca, index=df.index)
df_pca["Label"] = df["Label"]
df_pca.head(3)

Unnamed: 0_level_0,0,1,2,3,4,Label
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A,-1.326065,-0.93229,-0.250785,-0.060095,-0.120215,True
A,-1.351881,-0.228362,-0.231431,-0.680317,0.039195,True
A,-1.311331,1.204748,-0.107142,-0.784619,0.630375,True


In [30]:
# כמה שונות כל וקטור תופס מהמידע המקורי
pca.explained_variance_ratio_

array([0.71767437, 0.09679411, 0.06576474, 0.05703182, 0.03055692])

In [31]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# לולאה שרצה על כל המניות 
for stock in stocks:
    # הגדרת המניה
    data = df_pca[df.index == stock].copy()
    # הגדרת איקס שלנו שזה כל הפיטצ'רים פחות הלייבל
    X = data[data.columns[:-1]]
    # הגדרת ואיי להיות הלייבל שלי
    y = data['Label']
        # חלוקת הנתונים לטריין וטסט
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    
    accuracy_results = {}
    
    for name, model in models.items():
            # אימון המודל
        model.fit(X_train, y_train)
            
            # חיזוי המודל
        y_pred = model.predict(X_test)
            
            # חישוב אחוזי דיוק
        accuracy = accuracy_score(y_test, y_pred)
            
            # אחסון אחוזי הדיוק
        accuracy_results[name] = accuracy
   
    
    # שמירת אחוזי הדיוק של כל מניה
    all_accuracies[stock] = accuracy_results


In [32]:
# הדפסת תוצאות הפי-סי-איי שלנו של כל מודל לדאטה פריים
accuracy_df = pd.DataFrame(all_accuracies).T
print(accuracy_df.head(3))
accuracy_df.tail(1)

     Random Forest  AdaBoost  K-Nearest Neighbors  \
A         0.482759  0.564655             0.487069   
AAL       0.506024  0.554217             0.518072   
AAP       0.461847  0.497992             0.477912   

     Linear Discriminant Analysis  
A                        0.581897  
AAL                      0.530120  
AAP                      0.473896  


Unnamed: 0,Random Forest,AdaBoost,K-Nearest Neighbors,Linear Discriminant Analysis
ZTS,0.534137,0.526104,0.526104,0.526104


In [33]:
accuracy_df.describe()

Unnamed: 0,Random Forest,AdaBoost,K-Nearest Neighbors,Linear Discriminant Analysis
count,496.0,496.0,496.0,496.0
mean,0.50658,0.507641,0.502414,0.517425
std,0.033324,0.031242,0.030081,0.034858
min,0.413655,0.425703,0.417671,0.421687
25%,0.482551,0.485944,0.481928,0.491342
50%,0.506024,0.506024,0.502008,0.518072
75%,0.53012,0.53012,0.526104,0.542169
max,0.62249,0.60241,0.598394,0.618474
