In [1]:
import pandas as pd

# Read combined_data.csv into `df`, the working frame that the following cells
# rebind step by step, then report its size and preview the first rows.
df = pd.read_csv(filepath_or_buffer="combined_data.csv")
print("Original shape:", df.shape)
df.head(n=5)

Original shape: (3900, 179)


Unnamed: 0,Div,Date,Time,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,...,BMGMCA,BVCH,BVCD,BVCA,CLCH,CLCD,CLCA,LBCH,LBCD,LBCA
0,SP1,15/08/2024,18:00,Ath Bilbao,Getafe,1,1,D,1,0,...,,,,,,,,,,
1,SP1,15/08/2024,20:30,Betis,Girona,1,1,D,1,0,...,,,,,,,,,,
2,SP1,16/08/2024,18:00,Celta,Alaves,2,1,H,0,1,...,,,,,,,,,,
3,SP1,16/08/2024,20:30,Las Palmas,Sevilla,2,2,D,1,1,...,,,,,,,,,,
4,SP1,17/08/2024,18:00,Osasuna,Leganes,1,1,D,0,1,...,,,,,,,,,,


In [2]:
# Remove fully identical rows (the first occurrence is kept) and report how
# many rows disappeared by comparing the row count before and after.
before = len(df)
df = df.drop_duplicates(keep="first")
after = len(df)
print("Removed", before - after, "duplicate rows")
print("After duplicates:", df.shape)


Removed 0 duplicate rows
After duplicates: (3900, 179)


In [3]:
# Keep only the columns that are at least 50% populated.
#
# `DataFrame.dropna(thresh=...)` is documented as taking an int: the minimum
# number of non-null values a column needs in order to be kept. The half-way
# mark is therefore rounded up to a whole count instead of being passed as a
# float. Because non-null counts are integers, `count >= ceil(n / 2)` keeps
# exactly the same columns as `count >= n * 0.5`.
min_non_null = (len(df) + 1) // 2
df = df.dropna(axis=1, thresh=min_non_null)
print("After dropping very empty columns:", df.shape)

After dropping very empty columns: (3900, 99)


In [4]:
# Partition the column labels by dtype: int64/float64 columns are treated as
# numeric, object columns as categorical. Both results are pandas Index objects
# that later cells reuse.
numeric_cols = df.select_dtypes(include=["int64", "float64"]).columns
cat_cols = df.select_dtypes(include="object").columns

print("Numeric cols:", numeric_cols.size)
print("Categorical cols:", cat_cols.size)

Numeric cols: 92
Categorical cols: 7


In [5]:
# Fill gaps in the numeric columns with each column's own median.
column_medians = df[numeric_cols].median()
df[numeric_cols] = df[numeric_cols].fillna(column_medians)

# Fill gaps in the categorical columns with each column's most frequent value
# (`mode()` can return several values; the first one is used).
for c in cat_cols:
    most_frequent = df[c].mode()[0]
    df[c] = df[c].fillna(most_frequent)

print("‚úÖ Missing values handled")


‚úÖ Missing values handled


In [6]:
from sklearn.preprocessing import LabelEncoder


# Turn the FTR target into integers (only when it is still stored as text) and
# list the numeric columns most and least correlated with it.
if "FTR" not in df.columns:
    print("‚ùå FTR column not found in the dataset.")
else:
    if df["FTR"].dtype == "object":
        # LabelEncoder numbers the classes in sorted label order
        # (presumably A=0, D=1, H=2 -- confirm against `le.classes_`).
        le = LabelEncoder()
        df["FTR"] = le.fit_transform(df["FTR"])
        print("‚úÖ FTR column encoded as numeric.")

    # Pairwise correlations over the numeric columns only.
    corr = df.corr(numeric_only=True)
    target = "FTR"

    if target not in corr.columns:
        print("‚ö†Ô∏è Target column still not found.")
    else:
        # Sorted from most positive to most negative, so head/tail give the
        # two extremes.
        corr_values = corr[target].sort_values(ascending=False)
        print("üîπ Highest correlated features:\n", corr_values.head(20))
        print("\nüîπ Lowest correlated features:\n", corr_values.tail(20))



‚úÖ FTR column encoded as numeric.
üîπ Highest correlated features:
 FTR       1.000000
FTHG      0.613832
HTHG      0.413652
HST       0.345653
B365A     0.322149
PSCA      0.319370
PSA       0.318652
BWA       0.311001
WHA       0.310443
IWA       0.287933
VCA       0.278155
AvgA      0.267008
AvgCA     0.262039
B365CA    0.261037
MaxA      0.256867
MaxCA     0.250543
BWCA      0.248894
WHCA      0.242107
B365D     0.160485
WHD       0.158521
Name: FTR, dtype: float64

üîπ Lowest correlated features:
 AS       -0.159995
WHCH     -0.265218
BWCH     -0.271736
MaxH     -0.279543
MaxCH    -0.282421
AvgH     -0.283797
AvgCH    -0.287353
B365CH   -0.287437
IWH      -0.306803
VCH      -0.308749
AHh      -0.322888
AHCh     -0.324831
WHH      -0.334558
PSH      -0.335878
BWH      -0.337023
PSCH     -0.337588
B365H    -0.339821
AST      -0.375232
HTAG     -0.387004
FTAG     -0.615761
Name: FTR, dtype: float64


In [7]:
# Recompute the correlations with the target and discard every numeric column
# whose absolute correlation with it is below 0.02.
corr = df.corr(numeric_only=True)
target = "FTR"

if target not in corr.columns:
    print("‚ö†Ô∏è Target column not found in correlation matrix.")
else:
    # Vectorised threshold test; the target itself is taken out first so it can
    # never be selected. NaN correlations compare False and are therefore kept.
    abs_target_corr = corr[target].drop(labels=target).abs()
    low_corr_cols = abs_target_corr[abs_target_corr < 0.02].index.tolist()
    print("Will drop:", low_corr_cols)

    df = df.drop(columns=low_corr_cols)
    print(f"‚úÖ Dropped {len(low_corr_cols)} low-correlation columns.")


Will drop: ['AF', 'AC', 'AY', 'B365AHH', 'B365AHA', 'PAHH', 'PAHA', 'MaxAHH', 'MaxAHA', 'AvgAHH', 'AvgAHA', 'MaxCAHH', 'MaxCAHA', 'AvgCAHH', 'AvgCAHA']
‚úÖ Dropped 15 low-correlation columns.


In [8]:
# Keep only the target plus the numeric columns whose absolute correlation with
# the target is at least 0.25.
#
# NOTE: the kept list is drawn from the numeric-only correlation matrix, so any
# non-numeric column still present in `df` is dropped by this selection too.
corr = df.corr(numeric_only=True)
target = "FTR"

if target not in corr.columns:
    print("‚ö†Ô∏è Target column not found in correlation matrix.")
else:
    abs_target_corr = corr[target].abs()
    # The target is kept unconditionally, even if its self-correlation is NaN.
    keep_mask = (abs_target_corr >= 0.25) | (abs_target_corr.index == target)
    strong_corr_cols = abs_target_corr[keep_mask].index.tolist()
    print("Will keep:", strong_corr_cols)

    df = df[strong_corr_cols]
    print(f"‚úÖ Kept {len(strong_corr_cols)} highly correlated features.")


Will keep: ['FTHG', 'FTAG', 'FTR', 'HTHG', 'HTAG', 'HST', 'AST', 'B365H', 'B365A', 'BWH', 'BWA', 'PSH', 'PSA', 'WHH', 'WHA', 'MaxH', 'MaxA', 'AvgH', 'AvgA', 'AHh', 'B365CH', 'B365CA', 'BWCH', 'PSCH', 'PSCA', 'WHCH', 'MaxCH', 'MaxCA', 'AvgCH', 'AvgCA', 'AHCh', 'IWH', 'IWA', 'VCH', 'VCA']
‚úÖ Kept 35 highly correlated features.


In [10]:
from sklearn.preprocessing import LabelEncoder


# Integer-encode every column that is still stored as text.
cat_cols = df.select_dtypes(include=["object"]).columns

# One encoder per column, kept in a dict keyed by column name, so each
# column's label <-> code mapping stays available afterwards (for example via
# `label_encoders[col].inverse_transform(...)`). Refitting a single shared
# encoder would only remember the last column it saw.
#
# `le` is deliberately NOT rebound here: the earlier cell fits `le` on the FTR
# target, and overwriting it with a fresh encoder would throw that mapping away.
label_encoders = {}
for c in cat_cols:
    label_encoders[c] = LabelEncoder()
    df[c] = label_encoders[c].fit_transform(df[c])

print("‚úÖ Categorical columns encoded")


‚úÖ Categorical columns encoded


In [11]:
# Refresh the correlation matrix for the reduced frame and show the 20 entries
# at the top of the descending ranking against the target.
corr = df.corr(numeric_only=True)
target = "FTR"
if target in corr.columns:
    ranked_corr = corr[target].sort_values(ascending=False)
    print(ranked_corr.iloc[:20])


FTR       1.000000
FTHG      0.613832
HTHG      0.413652
HST       0.345653
B365A     0.322149
PSCA      0.319370
PSA       0.318652
BWA       0.311001
WHA       0.310443
IWA       0.287933
VCA       0.278155
AvgA      0.267008
AvgCA     0.262039
B365CA    0.261037
MaxA      0.256867
MaxCA     0.250543
WHCH     -0.265218
BWCH     -0.271736
MaxH     -0.279543
MaxCH    -0.282421
Name: FTR, dtype: float64


In [12]:
# Second pass of the low-correlation filter. It relies on `corr` and `target`
# as left behind by the previous cell and drops every non-target column whose
# absolute correlation with the target is below 0.02.
low_corr_cols = []
for col in corr.columns:
    if col == target:
        continue
    if abs(corr.loc[col, target]) < 0.02:
        low_corr_cols.append(col)

# Only the first 20 names are printed; all selected columns are dropped.
print("Will drop:", low_corr_cols[:20])

df = df.drop(columns=low_corr_cols)

Will drop: []


In [13]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# `numeric_cols` was built in the earlier dtype cell, i.e. before FTR was
# label-encoded, so the encoded target is not in the list and stays unscaled.
# Narrow the list to the columns that survived the feature-selection cells.
numeric_cols = [name for name in numeric_cols if name in df.columns]

if not numeric_cols:
    print("‚ö†Ô∏è No numeric columns found to normalize")
else:
    # Standardise the surviving numeric columns in place.
    df[numeric_cols] = scaler.fit_transform(df[numeric_cols])
    print("‚úÖ Numeric columns normalized")


‚úÖ Numeric columns normalized


In [14]:
# Drop rows that are extreme outliers in any of the match-statistic columns
# still present in `df`, using fences at 3 * IQR beyond the quartiles.
possible_cols = ["FTHG", "FTAG", "HTHG", "HTAG", "HS", "AS", "HST", "AST"]
outlier_cols = [c for c in possible_cols if c in df.columns]

print("Outlier check on:", outlier_cols)

if outlier_cols:
    Q1 = df[outlier_cols].quantile(0.25)
    Q3 = df[outlier_cols].quantile(0.75)
    IQR = Q3 - Q1

    # A column whose IQR is 0 (or undefined) has both fences collapsed onto a
    # single value, so every row that differs from that value would be thrown
    # away. Such columns carry no usable spread and are left out of the check.
    checked_cols = IQR[IQR > 0].index.tolist()
    skipped_cols = [c for c in outlier_cols if c not in checked_cols]
    if skipped_cols:
        print("Skipped (zero or undefined IQR):", skipped_cols)

    lower_fence = Q1[checked_cols] - 3 * IQR[checked_cols]
    upper_fence = Q3[checked_cols] + 3 * IQR[checked_cols]
    checked_values = df[checked_cols]

    # A row is an outlier when at least one checked column lies outside its fences.
    is_outlier = ((checked_values < lower_fence) | (checked_values > upper_fence)).any(axis=1)

    before = df.shape[0]
    df = df[~is_outlier]
    after = df.shape[0]
    print("Removed", before - after, "outliers")
else:
    print("No outliers step applied")


Outlier check on: ['FTHG', 'FTAG', 'HTHG', 'HTAG', 'HST', 'AST']
Removed 37 outliers


In [15]:
# Write the cleaned frame to disk without the index column and report its
# final dimensions.
df.to_csv(path_or_buf="combined_data_clean.csv", index=False)
print("Saved cleaned data as combined_data_clean.csv")
print("Final shape:", df.shape)

Saved cleaned data as combined_data_clean.csv
Final shape: (3863, 35)
