In [24]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, classification_report

In [25]:
#Reading csv files combined CSV.
merged_train=pd.read_csv("merged_train.csv")
merged_test=pd.read_csv("merged_test.csv")

In [26]:
#Dropping the duplicates.
merged_train.drop_duplicates(inplace=True)
merged_test.drop_duplicates(inplace=True)

In [27]:
merged_test.isna().sum()

Unnamed: 0         0
Column0            2
Column1            0
Column2            0
Column3        42234
Column4        42710
Column5        55659
Column6         1234
Column7            0
Column8         1234
Column9       243853
Column10           0
Column11           0
Column12           0
Column13           0
Column14      121679
Column15        5485
Column16           0
Column17           0
Column18           0
Column19           0
Column20           0
Column21           0
target             0
dtype: int64

In [28]:
merged_train.isna().sum()

Unnamed: 0         0
Column0            9
Column1            0
Column2            0
Column3       126303
Column4       127710
Column5       167180
Column6         3850
Column7            0
Column8         3850
Column9       732137
Column10           0
Column11           0
Column12           0
Column13           0
Column14      365703
Column15       16456
Column16           0
Column17           0
Column18           0
Column19           0
Column20           0
Column21           0
target             0
dtype: int64

In [29]:
merged_test.Column0.value_counts()

Column0
0.0     210054
2.0      24616
1.0      18312
5.0       2117
4.0       2112
6.0       2007
7.0       1401
3.0        549
9.0        159
8.0        143
11.0       112
12.0        54
14.0        30
15.0        22
16.0         7
10.0         5
13.0         5
18.0         3
17.0         2
Name: count, dtype: int64

In [30]:
#Dropping columns based on certain conditions.
merged_train.drop(columns=['Unnamed: 0','Column10','Column11','Column13','Column16','Column9','Column14'],inplace=True)
merged_test.drop(columns=['Unnamed: 0','Column10','Column11','Column13','Column16','Column9','Column14'],inplace=True)

In [31]:
mode_value = merged_train['Column0'].mode()[0]

# Fill missing values in 'Column0' with the mode
merged_train['Column0'].fillna(mode_value, inplace=True)
merged_test['Column0'].fillna(mode_value, inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  merged_train['Column0'].fillna(mode_value, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  merged_test['Column0'].fillna(mode_value, inplace=True)


In [32]:
#Handling null values.
merged_train=merged_train.loc[~(merged_train['Column0']).isnull()]
merged_test=merged_test.loc[~(merged_test['Column0'].isnull())]

In [33]:
merged_train.isna().sum()

Column0          0
Column1          0
Column2          0
Column3     126303
Column4     127710
Column5     167180
Column6       3850
Column7          0
Column8       3850
Column12         0
Column15     16456
Column17         0
Column18         0
Column19         0
Column20         0
Column21         0
target           0
dtype: int64

In [34]:
#Median impute the columns 15 and 5 due to their skewness.
merged_train['Column15']=merged_train['Column15'].fillna(merged_train['Column15'].median())
merged_test['Column15']=merged_test['Column15'].fillna(merged_test['Column15'].median())

merged_train['Column5']=merged_train['Column5'].fillna(merged_train['Column5'].median())
merged_test['Column5']=merged_test['Column5'].fillna(merged_test['Column5'].median())


In [35]:
merged_train=merged_train.loc[~(merged_train['Column6'].isnull() & merged_train['Column8'].isnull())]
merged_test=merged_test.loc[~(merged_test['Column6'].isnull() & merged_test['Column8'].isnull())]

In [36]:
#Imputing column3 and column4 usinf MICE, cuz these two are linearly co-related to each other.
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

imputer = IterativeImputer(random_state=100, max_iter=250)

mice_imputed_test=imputer.fit_transform(merged_test[['Column3','Column4']])
mice_imputed_train=imputer.fit_transform(merged_train[['Column3','Column4']])

merged_test[['Column3','Column4']]=mice_imputed_test
merged_train[['Column3','Column4']]=mice_imputed_train




In [37]:
from sklearn.preprocessing import StandardScaler
import pandas as pd

def Scalerr(merged_train, merged_test, cols_to_scale):
    # Initialize the scaler
    scaler = StandardScaler()
    
    # Fit the scaler on the training set only
    scaler.fit(merged_train[cols_to_scale])
    
    # Transform both the training and test set
    merged_train[cols_to_scale] = pd.DataFrame(scaler.transform(merged_train[cols_to_scale]), 
                                               columns=cols_to_scale, 
                                               index=merged_train.index)
    
    merged_test[cols_to_scale] = pd.DataFrame(scaler.transform(merged_test[cols_to_scale]), 
                                              columns=cols_to_scale, 
                                              index=merged_test.index)

# Example usage:
cols_to_scale = ['Column1', 'Column2']  # Add as many columns as you want to scale
Scalerr(merged_train, merged_test, cols_to_scale)







In [38]:
from sklearn.decomposition import PCA

pca = PCA(n_components=1)  # You can choose the number of components based on your needs
features_train = merged_train[['Column3','Column4']]
features_test=merged_test[['Column3','Column4']]
merged_train['Col_3&4'] = pca.fit_transform(features_train)
merged_test['Col_3&4']=pca.fit_transform(features_test)

In [39]:
transformed_train = merged_train.copy()
transformed_test=merged_test.copy()


In [40]:
import numpy as np
from sklearn.preprocessing import PowerTransformer

negative_skewed_columns = ['Column5','Column6', 'Column7', 'Column8', 'Column15', 'Column0','Column17','Column18']
       
def yeo_johnson_transform(df, columns):
    pt = PowerTransformer(method='yeo-johnson')
    for col in columns:
        df[col] = pt.fit_transform(df[[col]])  # Transform the original column directly
    return df

transformed_train = yeo_johnson_transform(transformed_train, negative_skewed_columns)

# Optionally, you can print the transformed data or check the skewness
# print(transformed_train[positive_skewed_columns].skew())
print(transformed_train[negative_skewed_columns].skew())


Column5     27.597343
Column6      1.461022
Column7      8.738096
Column8      0.252854
Column15    48.329992
Column0      1.524206
Column17     6.286120
Column18     2.208732
dtype: float64


In [41]:
# List of columns and their skewness types (to decide the transformation technique)


negative_skewed_columns = ['Column5','Column6', 'Column7', 'Column8','Column15','Column0','Column17','Column18']


# 2. Apply Yeo-Johnson Transformation for Highly Negative or Mixed Skewed Data
def yeo_johnson_transform(df, columns):
    pt = PowerTransformer(method='yeo-johnson')
    for col in columns:
        df[col] = pt.fit_transform(df[[col]])  # Transform the original column directly
    return df

transformed_test = yeo_johnson_transform(transformed_test, negative_skewed_columns)


In [42]:
import pandas as pd
from scipy.stats.mstats import winsorize

# Function to winsorize outliers for multiple columns
def winsorize_outliers(df, columns, lower_percentile=0.05, upper_percentile=0.95):
    
    for col in columns:
        non_null_mask = df[col].notnull()
        df.loc[non_null_mask, col] = winsorize(df.loc[non_null_mask, col], limits=(lower_percentile, 1 - upper_percentile))
        
    return df

# Example usage:
# Assuming 'data' is your DataFrame and you want to winsorize 'Column0', 'Column3', and 'Column5'
columns_to_winsorize = ['Column0','Column5','Column6', 'Column7', 'Column8','Column15','Column17',
       'Column18']
transformed_train = winsorize_outliers(transformed_train, columns_to_winsorize, lower_percentile=0.01, upper_percentile=0.99)
transformed_test = winsorize_outliers(transformed_test, columns_to_winsorize, lower_percentile=0.01, upper_percentile=0.99)



In [43]:
ready_train=transformed_train
ready_test=transformed_test

In [44]:
#Post Transformation.

c = ['Column0','Column5',
       'Column6', 'Column7', 'Column8','Column15', 'Column17',
       'Column18','Col_3&4']

scaler = RobustScaler() 

ready_train[c] = scaler.fit_transform(ready_train[c])
ready_test[c] = scaler.fit_transform(ready_test[c])

In [45]:
X_train=ready_train.drop(columns=['target','Column3','Column4'],axis=1)
y_train=ready_train['target']

X_test=ready_test.drop(columns=['target','Column3','Column4'],axis=1)
y_test=ready_test['target']

In [46]:
least_important_cols=['Column0','Column2', 'Column5', 'Column6', 'Column7',
       'Column8','Column15', 'Column17','Col_3&4']
#least important columns foud by feature importance. to the target variable.

In [47]:
from sklearn.cluster import KMeans
import pandas as pd

least_important_features = X_train[least_important_cols]

# Step 1: Apply KMeans clustering on the least important features
kmeans = KMeans(n_clusters=2, random_state=42)  # Adjust number of clusters as needed
kmeans.fit(least_important_features)

# Step 2: Compute the distance of each data point to each cluster centroid
distances = kmeans.transform(least_important_features)  # This gives an array with shape (n_samples, n_clusters)

# Step 3: Add the distance matrix as new features to the main dataset
# Each column represents the distance to one cluster centroid
for i in range(distances.shape[1]):
    X_train[f'Distance_to_Cluster_{i}'] = distances[:, i]

test_feature_scaled=X_test[least_important_cols]
# Compute the distance of test data points to each cluster centroid
test_distances = kmeans.transform(test_feature_scaled)  # This will give (n_test_samples, n_clusters) matrix

# Add the distances as new features to the test dataset
for i in range(test_distances.shape[1]):
    X_test[f'Distance_to_Cluster_{i}'] = test_distances[:, i]


In [48]:
import joblib

In [50]:
joblib.dump(kmeans,'kmeans_model.pkl')

['kmeans_model.pkl']

In [None]:
    all.