In [25]:
# Step 3:  Data Preprocessing and Preparation
# In this step, we will transform the dataset into a form suitable for training machine learning models:

In [47]:
#laibraries will use it in the AI Project
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from scipy.stats import mstats

In [48]:
# Load the dataset
file_path = '/content/sample_data/TG_T_CashValues_Rel.xlsx'
df = pd.read_excel(file_path, sheet_name='TG80_T_Barwerte')

In [49]:
ndf=df.copy()

In [50]:
# Step 1: Encoding Categorical Features (Gender)
# Label encode Gender (Male -> 0, Female -> 1)
if 'Gender' in df.columns:
    le = LabelEncoder()
    ndf['Gender'] = le.fit_transform(df['Gender'])
    print("\nGender column encoded: 0 = Male, 1 = Female")
else:
    print("Gender column not found in the dataset.")


Gender column encoded: 0 = Male, 1 = Female


In [51]:
print(df)
print("ndf")
print(ndf)

      Gender  Age  Dur        PPV
0       Male    0   20  15.198437
1       Male    0   21  15.741015
2       Male    0   22  16.267189
3       Male    0   23  16.777448
4       Male    0   24  17.272269
...      ...  ...  ...        ...
5265  Female   78    2   1.925421
5266  Female   78    3   2.777638
5267  Female   79    1   1.000000
5268  Female   79    2   1.920897
5269  Female   80    1   1.000000

[5270 rows x 4 columns]
ndf
      Gender  Age  Dur        PPV
0          1    0   20  15.198437
1          1    0   21  15.741015
2          1    0   22  16.267189
3          1    0   23  16.777448
4          1    0   24  17.272269
...      ...  ...  ...        ...
5265       0   78    2   1.925421
5266       0   78    3   2.777638
5267       0   79    1   1.000000
5268       0   79    2   1.920897
5269       0   80    1   1.000000

[5270 rows x 4 columns]


In [54]:
# Step 2: Scaling Numerical Features (Age, Duration)
# Standardize Age and Duration using StandardScaler
scaler = StandardScaler()
numerical_columns = ['Age', 'Dur'] # Adjust column names if necessary

In [55]:
try:
    ndf[numerical_columns] = scaler.fit_transform(ndf[numerical_columns])
    print(f"Numerical columns {numerical_columns} successfully scaled.")
except KeyError as e:
    print(f"Error: {e}. Ensure numerical columns exist in the dataset.")

Numerical columns ['Age', 'Dur'] successfully scaled.


In [56]:
# Step 3: Handling Outliers
# Winsorize numerical features to handle outliers
outlier_columns = ['Age', 'Dur', 'PPV']  # Columns to handle outliers for
if set(outlier_columns).issubset(ndf.columns):
    for col in outlier_columns:
        ndf[col] = mstats.winsorize(ndf[col], limits=[0.01, 0.01])  # Trims top and bottom 1%
    print(f"Outliers in {outlier_columns} handled using winsorization.")
else:
    print("Error: Outlier columns missing in dataset.")


Outliers in ['Age', 'Dur', 'PPV'] handled using winsorization.


In [57]:
# Step 4: Splitting Data
# Separate features (X) and target variable (y)
X = ndf[['Gender', 'Age', 'Dur']]  # Features for training
y = ndf['PPV']  # Target variable
print("Features and target variable separated.")

Features and target variable separated.


In [58]:
# Split into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("Data split into training and testing sets.")

Data split into training and testing sets.


In [59]:
# Step 5: Feature Selection (if necessary)
# All features ('Gender', 'Age', 'Dur') are retained in this case
# (No action needed as they're already prepared for modeling)

In [60]:

# Step 6: Prepare for Modeling
# Confirm alignment of features and target variable
print("Preprocessed data is ready for model training.")
print(f"Training set size: {X_train.shape[0]} rows")
print(f"Testing set size: {X_test.shape[0]} rows")

Preprocessed data is ready for model training.
Training set size: 4216 rows
Testing set size: 1054 rows


In [65]:
ndf

Unnamed: 0,Gender,Age,Dur,PPV
0,1,-1.676191,-0.208986,15.198437
1,1,-1.676191,-0.136481,15.741015
2,1,-1.676191,-0.063975,16.267189
3,1,-1.676191,0.008530,16.777448
4,1,-1.676191,0.081035,17.272269
...,...,...,...,...
5265,0,2.287434,-1.514083,1.925421
5266,0,2.287434,-1.441578,2.777638
5267,0,2.287434,-1.586588,1.000000
5268,0,2.287434,-1.514083,1.920897


In [66]:
# Save the preprocessed dataset to a new CSV file
ndf.to_csv('preprocessed_data.csv', index=False)
print("Preprocessed data saved to 'preprocessed_data.csv'")

Preprocessed data saved to 'preprocessed_data.csv'


In [67]:
# Save features (X) and target (y) to separate CSV files
X.to_csv('features.csv', index=False)
y.to_csv('target.csv', index=False)
print("Features and target saved to 'features.csv' and 'target.csv'")

Features and target saved to 'features.csv' and 'target.csv'
