In [36]:
import pandas as pd
# Load the raw league table from CSV into a DataFrame.
# NOTE(review): the path is absolute and specific to one machine; the notebook
# will not run elsewhere without editing it.
df = pd.read_csv(
    filepath_or_buffer="C:/Users/User/Desktop/AI_Projects/Project_05/Data/Raw_data/Top5_Leagues_2005_2025.csv"
)

In [37]:
# Preview the first five rows of the raw frame.
df.head(n=5)

Unnamed: 0,Rank,Club,Squad size,Average age,Foreigners,Market value,Total market value,League,Season
0,,Bayern Munich,28,28.3,14,€6.35m,€177.88m,Bundesliga,2005
1,,SV Werder Bremen,28,25.7,13,€3.99m,€111.60m,Bundesliga,2005
2,,Hamburger SV,35,25.1,17,€3.05m,€106.90m,Bundesliga,2005
3,,FC Schalke 04,32,25.8,15,€2.72m,€87.18m,Bundesliga,2005
4,,Bayer 04 Leverkusen,33,24.9,15,€2.46m,€81.23m,Bundesliga,2005


In [38]:
def _parse_market_value(raw):
    """Convert a money string such as '€6.35m' into a float amount in euros.

    Recognised suffixes are 'bn' (billions), 'm' (millions) and 'k'
    (thousands). The euro sign and thousands separators (',') are stripped
    before parsing.

    Parameters
    ----------
    raw : str or number
        Cell value from the "Market value" column.

    Returns
    -------
    float
        The amount in euros. Values that are already numeric are passed
        through unchanged (NaN stays NaN), which makes re-running the cell
        safe. Text that cannot be parsed at all yields 0.0, matching the
        previous fallback.
    """
    # Non-string input means the column was already converted (cell re-run)
    # or the value is missing; do not try substring checks on it.
    if not isinstance(raw, str):
        return float('nan') if raw is None else float(raw)

    text = raw.replace('€', '').replace(',', '').strip()

    # 'bn' must be tested before 'm'/'k'; the previous version had no 'k'
    # branch, so amounts quoted in thousands were silently turned into 0.
    for suffix, factor in (('bn', 1_000_000_000), ('m', 1_000_000), ('k', 1_000)):
        if text.endswith(suffix):
            return float(text[:-len(suffix)]) * factor

    # No suffix: accept a plain number, otherwise keep the old 0 fallback.
    try:
        return float(text)
    except ValueError:
        return 0.0


df["Market value"] = df["Market value"].apply(_parse_market_value)

# NOTE(review): "Total market value" is left as text here, so the later
# encoding step label-encodes strings like '€177.88m' — confirm that is intended.

# errors="ignore" keeps the cell re-runnable after "Rank" has been removed.
df = df.drop(columns="Rank", errors="ignore")


In [41]:
# Split clubs into quantile-based terciles of market value. The labels are
# listed in reverse, so the lowest-value tercile is 2 and the highest is 0.
df['Category'] = pd.qcut(
    x=df['Market value'],
    q=3,
    labels=[2, 1, 0],
)

In [42]:
# Column dtypes and non-null counts after cleaning.
df.info()

# First five rows, now including the derived Category column.
df.head(n=5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2052 entries, 0 to 2051
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype   
---  ------              --------------  -----   
 0   Club                2052 non-null   object  
 1   Squad size          2052 non-null   int64   
 2   Average age         2052 non-null   float64 
 3   Foreigners          2052 non-null   int64   
 4   Market value        2052 non-null   float64 
 5   Total market value  2052 non-null   object  
 6   League              2052 non-null   object  
 7   Season              2052 non-null   int64   
 8   Category            2052 non-null   category
dtypes: category(1), float64(2), int64(3), object(3)
memory usage: 130.5+ KB


Unnamed: 0,Club,Squad size,Average age,Foreigners,Market value,Total market value,League,Season,Category
0,Bayern Munich,28,28.3,14,6350000.0,€177.88m,Bundesliga,2005,0
1,SV Werder Bremen,28,25.7,13,3990000.0,€111.60m,Bundesliga,2005,1
2,Hamburger SV,35,25.1,17,3050000.0,€106.90m,Bundesliga,2005,1
3,FC Schalke 04,32,25.8,15,2720000.0,€87.18m,Bundesliga,2005,1
4,Bayer 04 Leverkusen,33,24.9,15,2460000.0,€81.23m,Bundesliga,2005,1


In [43]:
# Count missing values per column.
df.isna().sum()

Club                  0
Squad size            0
Average age           0
Foreigners            0
Market value          0
Total market value    0
League                0
Season                0
Category              0
dtype: int64

In [44]:
import logging
import os
# Make sure the log directory exists before the file handler is created.
log_folder = "C:/Users/User/Desktop/AI_Projects/Project_05/Log"
os.makedirs(log_folder, exist_ok=True)
log_file = os.path.join(log_folder, 'preprocessing.log')

# Append INFO-and-above records to the preprocessing log.
logging.basicConfig(
    format="%(asctime)s-%(levelname)s-%(message)s",
    level=logging.INFO,
    filemode='a',
    filename=log_file,
)



In [45]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
import logging

# The earlier logging cell has already attached a handler to the root logger,
# and basicConfig() does nothing once the root logger has handlers. This block
# therefore only takes effect when the cell is run on its own in a fresh
# kernel; the guard makes that behaviour explicit.
if not logging.getLogger().handlers:
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(message)s",
        level=logging.INFO,
        filename="C:/Users/User/Desktop/AI_Projects/Project_05/Log/preprocessing.log",  # log file location
    )

class DataPreprocessing:
    """Chainable preprocessing pipeline: impute, encode, then scale.

    The instance works on its own copy of the frame, so the DataFrame passed
    in is never modified. Every stage returns ``self`` so calls can be
    chained; ``get_data()`` returns the processed frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; copied on construction.
    target : str, default "category"
        Name of the label column, which is excluded from scaling. The name is
        matched exactly first and, failing that, case-insensitively, so the
        default ``"category"`` also protects a column named ``"Category"``.
    """

    def __init__(self, df, target="category"):
        self.df = df.copy()
        self.target = target
        logging.info("DataPreprocessing class initialized")

    def _target_column(self):
        """Return the real column name that ``self.target`` refers to, or None."""
        columns = list(self.df.columns)
        if self.target in columns:
            return self.target
        wanted = str(self.target).lower()
        candidates = [c for c in columns if str(c).lower() == wanted]
        # Only accept a case-insensitive match when it is unambiguous.
        return candidates[0] if len(candidates) == 1 else None

    @staticmethod
    def _fills_with_mode(column):
        """True for text-like columns (object, string, categorical), which have no mean."""
        dtype = column.dtype
        return (
            pd.api.types.is_object_dtype(dtype)
            or pd.api.types.is_string_dtype(dtype)
            or isinstance(dtype, pd.CategoricalDtype)
        )

    def missing_values(self):
        """Fill missing values column by column.

        Text-like columns (object, string, categorical) are filled with their
        most frequent value; all other columns are filled with their mean.
        Columns that contain no usable value at all are left untouched and a
        warning is logged.

        Returns
        -------
        DataPreprocessing
            ``self``, to allow chaining.

        Raises
        ------
        Exception
            Any error from pandas is logged and re-raised unchanged.
        """
        try:
            for col in self.df.columns:
                column = self.df[col]
                if not column.isnull().any():
                    continue

                if self._fills_with_mode(column):
                    modes = column.mode()
                    fill_value = modes.iloc[0] if not modes.empty else None
                    strategy = "mode"
                else:
                    fill_value = column.mean()
                    strategy = "mean"

                # An all-missing column has no mode and a NaN mean.
                if fill_value is None or pd.isna(fill_value):
                    logging.warning(f"{col}: column has no non-missing values; left unfilled.")
                    continue

                # Assign the result back instead of fillna(inplace=True) on a
                # column selection: that form is chained assignment, which
                # warns in pandas 2.x and has no effect under Copy-on-Write.
                self.df[col] = column.fillna(fill_value)
                logging.info(f"{col} ustunidagi bo'sh qiymatlar {strategy} bilan to'ldirildi.")
            return self
        except Exception as e:
            logging.error(f"Missing value bosqichida xatolik: {e}")
            raise

    def encoding(self):
        """Encode object-dtype columns as numbers.

        Columns with at most two distinct values are one-hot encoded (the
        original column is replaced by ``<col>_<value>`` indicator columns);
        columns with more distinct values are label-encoded in place.

        Returns
        -------
        DataPreprocessing
            ``self``, to allow chaining.

        Raises
        ------
        Exception
            Any error is logged and re-raised unchanged.
        """
        try:
            encoder = LabelEncoder()
            for col in self.df.select_dtypes(include=["object"]).columns:
                if self.df[col].nunique() <= 2:
                    dummies = pd.get_dummies(self.df[col], dtype=int, prefix=col)
                    self.df = pd.concat([self.df.drop(columns=col), dummies], axis=1)
                    logging.info(f"{col} ustuni one-hot encoding qilindi.")
                else:
                    # fit_transform refits the encoder, so reusing one
                    # instance across columns is safe.
                    self.df[col] = encoder.fit_transform(self.df[col])
                    logging.info(f"{col} ustuni LabelEncoder bilan kodlandi.")
            return self
        except Exception as e:
            logging.error(f"Encoding bosqichida xatolik: {e}")
            raise

    def scaling(self):
        """Min-max scale every numeric column except the target.

        Does nothing when the frame has no rows or no numeric feature
        columns, instead of letting the scaler raise.

        Returns
        -------
        DataPreprocessing
            ``self``, to allow chaining.

        Raises
        ------
        Exception
            Any error is logged and re-raised unchanged.
        """
        try:
            numeric_cols = self.df.select_dtypes(include=[np.number]).columns
            target_col = self._target_column()
            feature_cols = [c for c in numeric_cols if c != target_col]

            if not feature_cols or self.df.empty:
                logging.info("Scaling skipped: no numeric feature columns to scale.")
                return self

            scaler = MinMaxScaler()
            self.df[feature_cols] = scaler.fit_transform(self.df[feature_cols])
            logging.info("Raqamli ustunlar MinMaxScaler bilan o'lchovga keltirildi.")
            return self
        except Exception as e:
            logging.error(f"Scaling bosqichida xatolik: {e}")
            raise

    def get_data(self):
        """Return the processed DataFrame held by this instance."""
        logging.info("Tayyor ma'lumot olindi.")
        return self.df


In [46]:
# Run the whole pipeline: impute -> encode -> scale.
# The label column in this frame is "Category" (capital C); the previous
# lower-case "category" did not name any column.
dp = DataPreprocessing(df, target="Category")
# df is rebound to the processed frame, so re-running this cell would feed
# already-processed data back into the pipeline.
df = dp.missing_values().encoding().scaling().get_data()

In [47]:
# Preview the first five rows of the encoded and scaled frame.
df.head(n=5)

Unnamed: 0,Club,Squad size,Average age,Foreigners,Market value,Total market value,League,Season,Category
0,0.152381,0.117647,0.792208,0.341463,0.113312,0.242991,0.0,0.0,0
1,0.814286,0.117647,0.454545,0.317073,0.071199,0.057724,0.0,0.0,1
2,0.485714,0.220588,0.376623,0.414634,0.054425,0.040682,0.0,0.0,1
3,0.414286,0.176471,0.467532,0.365854,0.048537,0.927982,0.0,0.0,1
4,0.147619,0.191176,0.350649,0.365854,0.043897,0.8895,0.0,0.0,1


In [48]:
import os 
# Write the preprocessed frame to disk, creating the output folder if needed.
try:
    output_folder = "C:/Users/User/Desktop/AI_Projects/Project_05/Data/Preprocessed_data"
    os.makedirs(output_folder, exist_ok=True)
    output_path = os.path.join(output_folder, "preprocessed_top_5.csv")
    df.to_csv(output_path, index=False)
    print(f"Data savet to -> {output_path}")
    logging.info("Maluot saqlandi")
except Exception as e:
    # Record what actually went wrong and re-raise, as the preprocessing
    # class does; swallowing the error would leave a failed save looking
    # like a success in the notebook.
    logging.error(f"Xatolik: {e}")
    raise

Data savet to -> C:/Users/User/Desktop/AI_Projects/Project_05/Data/Preprocessed_data\preprocessed_top_5.csv
