In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from feature_summary import export_feature_summary
from round_to_nearest import round_to_nearest_multiple
from split_dataset_by_missing_and_type import split_dataset_by_missing_and_type
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import hashlib
from sklearn.preprocessing import PowerTransformer

In [2]:
data = pd.read_csv('final_proj_data.csv')
test_data = pd.read_csv('final_proj_test.csv')

### Removing Not Needed Data

In [3]:
print(data.shape)
column_threshold = 0.30

missing_percent = data.isnull().mean()
columns_to_drop = missing_percent[missing_percent > column_threshold].index
cleaned_data = data.drop(columns=columns_to_drop)

print(f"Columns removed: {list(columns_to_drop)}")
print(f"New dataset shape after column removal: {cleaned_data.shape}")

row_threshold = int((1 - 0.30) * cleaned_data.shape[1])
cleaned_data = cleaned_data.dropna(thresh=row_threshold)
print(f"New dataset shape after removing rows with more than 30% missing columns: {cleaned_data.shape}")

cleaned_data.to_csv('fully_cleaned_data.csv', index=False)

(10000, 231)
Columns removed: ['Var1', 'Var2', 'Var3', 'Var4', 'Var5', 'Var8', 'Var9', 'Var10', 'Var11', 'Var12', 'Var14', 'Var15', 'Var16', 'Var17', 'Var18', 'Var19', 'Var20', 'Var23', 'Var26', 'Var27', 'Var29', 'Var30', 'Var31', 'Var32', 'Var33', 'Var34', 'Var36', 'Var37', 'Var39', 'Var40', 'Var41', 'Var42', 'Var43', 'Var45', 'Var46', 'Var47', 'Var48', 'Var49', 'Var50', 'Var51', 'Var52', 'Var53', 'Var54', 'Var55', 'Var56', 'Var58', 'Var59', 'Var60', 'Var61', 'Var62', 'Var63', 'Var64', 'Var66', 'Var67', 'Var68', 'Var69', 'Var70', 'Var71', 'Var72', 'Var75', 'Var77', 'Var79', 'Var80', 'Var82', 'Var84', 'Var86', 'Var87', 'Var88', 'Var89', 'Var90', 'Var91', 'Var92', 'Var93', 'Var94', 'Var95', 'Var96', 'Var97', 'Var98', 'Var99', 'Var100', 'Var101', 'Var102', 'Var103', 'Var104', 'Var105', 'Var106', 'Var107', 'Var108', 'Var110', 'Var111', 'Var114', 'Var115', 'Var116', 'Var117', 'Var118', 'Var120', 'Var121', 'Var122', 'Var124', 'Var127', 'Var128', 'Var129', 'Var130', 'Var131', 'Var135', 'Var1

### Imputation

In [4]:
# MISSING VALUES
features_with_missing_values_count = summary_df[summary_df['Missing Value Count'] > 0].shape[0]
print(features_with_missing_values_count)
split_dataset_by_missing_and_type(summary_df)

NameError: name 'summary_df' is not defined

#### Numerical Imputation

In [None]:
# Numerical continuous variables with missing values <= 100
numerical_continuous_missing_leq_100 = ['Var6', 'Var13', 'Var21', 'Var74', 'Var81', 'Var119', 'Var125', 'Var140']
# Perform median imputation for the specified variables
for col in numerical_continuous_missing_leq_100:
    if col in cleaned_data.columns:
        median_value = cleaned_data[col].median()
        cleaned_data[col] = cleaned_data[col].fillna(median_value)

In [None]:
# Numerical continuous variables with missing values > 100 and <= 1000
numerical_continuous_missing_between_100_and_1000 = ['Var24', 'Var109', 'Var149']
# Numerical continuous variables with missing values > 1000
numerical_continuous_missing_gt_1000 = ['Var126']
# Numerical ordinal variables with a common divisor of 7
numerical_ordinal_divisor_7_group = ['Var7']
# Numerical ordinal variables with a common divisor of 9
numerical_ordinal_divisor_9_group = ['Var144', 'Var65']

# Create a SimpleImputer object with the median strategy
median_imputer = SimpleImputer(strategy='median')

# Apply the imputation on the selected columns
cleaned_data[numerical_continuous_missing_gt_1000] = median_imputer.fit_transform(cleaned_data[numerical_continuous_missing_gt_1000])
cleaned_data[numerical_continuous_missing_between_100_and_1000] = median_imputer.fit_transform(cleaned_data[numerical_continuous_missing_between_100_and_1000])
cleaned_data[numerical_ordinal_divisor_7_group] = median_imputer.fit_transform(cleaned_data[numerical_ordinal_divisor_7_group])
cleaned_data[numerical_ordinal_divisor_9_group] = median_imputer.fit_transform(cleaned_data[numerical_ordinal_divisor_9_group])

# Apply rounding to nearest multiple of 7 for Var7
cleaned_data[numerical_ordinal_divisor_7_group] = cleaned_data[numerical_ordinal_divisor_7_group].apply(lambda x: round_to_nearest_multiple(x, 7))

# Apply rounding to nearest multiple of 9 for Var144 and Var65
for var in numerical_ordinal_divisor_9_group:
    cleaned_data[var] = cleaned_data[var].apply(lambda x: round_to_nearest_multiple(x, 9))


#### Categorical Imputation

In [None]:
# CATEGORICAL
# Categorical continuous variables
categorical_continuous_variables = ['Var192', 'Var197', 'Var217']

# Create SimpleImputer object with the most frequent strategy
mode_imputer = SimpleImputer(strategy='most_frequent')
cleaned_data[categorical_continuous_variables] = mode_imputer.fit_transform(cleaned_data[categorical_continuous_variables])

# Categorical variables with few unique values and up to 345 missing
categorical_variables_few_unique_missing_leq_345 = ['Var203', 'Var205', 'Var218', 'Var208']
cleaned_data[categorical_variables_few_unique_missing_leq_345] = mode_imputer.fit_transform(cleaned_data[categorical_variables_few_unique_missing_leq_345])

# Categorical variables with many unique values and little missing
categorical_variables_many_unique_missing_low = ['Var206']
# Apply the imputation
constant_imputer = SimpleImputer(strategy='constant', fill_value='Unknown')
cleaned_data[categorical_variables_many_unique_missing_low] = constant_imputer.fit_transform(cleaned_data[categorical_variables_many_unique_missing_low])

# Categorical variables with many unique values and many missing
categorical_variables_many_unique_missing_high = ['Var219']
# Apply the KNN imputation on categorical variables with many unique values
cleaned_data[categorical_variables_many_unique_missing_high] = constant_imputer.fit_transform(cleaned_data[categorical_variables_many_unique_missing_high])

# Categorical variables with few unique values and many missing
categorical_variables_few_unique_missing_high = ['Var223']
# Apply most frequent imputation
cleaned_data[categorical_variables_few_unique_missing_high] = mode_imputer.fit_transform(cleaned_data[categorical_variables_few_unique_missing_high])

### Encoding

In [None]:
imputed_data = cleaned_data
imputed_data.to_csv('fully_imputed_data.csv', index=False)
export_feature_summary(cleaned_data, 'feature_summary.csv', unique_threshold = 40)
summary_df = pd.read_csv('feature_summary.csv')
split_dataset_by_missing_and_type(summary_df)

In [None]:
binary_features_names = ['Var208', 'Var218', 'Var218']
label_encoders = {}
for feature in binary_features_names:
    label_encoders[feature] = LabelEncoder()
    imputed_data[feature] = label_encoders[feature].fit_transform(imputed_data[feature])


In [None]:
one_hot_encoding_names = ['Var196', 'Var205', 'Var223', 'Var203', 'Var210', 'Var221', 'Var227', 'Var207']
# Apply one-hot encoding using pd.get_dummies
imputed_data_one_hot_encoded = pd.get_dummies(imputed_data, columns=one_hot_encoding_names)

In [None]:
freequency_encoding_names = ['Var219', 'Var206', 'Var226', 'Var228', 'Var193', 'Var212', 'Var204']
# Apply frequency encoding to each feature
for feature in freequency_encoding_names:
    frequency_map = imputed_data[feature].value_counts() / len(imputed_data)
    imputed_data[feature] = imputed_data[feature].map(frequency_map)

# Display the first few rows of the dataset after frequency encoding
print(imputed_data[freequency_encoding_names].head())

In [None]:
target_encoding_names = ['Var197', 'Var192']
# Apply target encoding
for feature in target_encoding_names:
    # Create a mapping from category to mean target value
    target_mean = imputed_data.groupby(feature)['y'].mean()
    imputed_data[feature] = imputed_data[feature].map(target_mean)

# Display the first few rows of the dataset after target encoding
print(imputed_data[target_encoding_names].head())

In [None]:
smoothning_target_encoding_names = ['Var220', 'Var198', 'Var216', 'Var199', 'Var222']
# Assuming your target variable is called 'y' (replace 'y' with the actual target column name)
global_mean = imputed_data['y'].mean()

# Define a smoothing factor (you can adjust this value)
alpha = 10

# Apply smoothing target encoding
for feature in smoothning_target_encoding_names:
    # Calculate category mean and the count of occurrences
    category_stats = imputed_data.groupby(feature)['y'].agg(['mean', 'count'])
    
    # Apply the smoothing formula
    smooth = (category_stats['count'] * category_stats['mean'] + alpha * global_mean) / (category_stats['count'] + alpha)
    
    # Map the smoothed values back to the original feature
    imputed_data[feature] = imputed_data[feature].map(smooth)

# Display the first few rows of the dataset after smoothing target encoding
print(imputed_data[smoothning_target_encoding_names].head())

In [None]:
hashing_encoding_names = ['Var202', 'Var217']
# Define the number of buckets for the hash function (adjust this value as needed)
n_buckets = 100

# Function to apply hashing encoding
def hash_encode(value, n_buckets):
    return int(hashlib.md5(str(value).encode()).hexdigest(), 16) % n_buckets

# Apply hashing encoding to each feature
for feature in hashing_encoding_names:
    imputed_data[feature + '_hashed'] = imputed_data[feature].apply(lambda x: hash_encode(x, n_buckets))

# Display the first few rows of the dataset after hashing encoding
print(imputed_data[[feature + '_hashed' for feature in hashing_encoding_names]].head())


### Normalising Numeric Features

In [None]:
from scipy.stats.mstats import winsorize

# Function to apply Winsorization to numeric features
def winsorize_features(df, features, limits=(0.05, 0.05)):
    # Winsorize each feature with the given limits (5% on both sides by default)
    for feature in features:
        df[feature] = winsorize(df[feature], limits=limits)
    return df

# Apply Winsorization to the numeric features with 5% limits on both sides
winsorized_data = winsorize_features(imputed_data, numeric_features, limits=(0.05, 0.05))

# Check the result
print(f"Data shape after Winsorizing: {winsorized_data.shape}")

In [None]:
min_max_scaling = ['Var173']

scaler = MinMaxScaler()
imputed_data[min_max_scaling] = scaler.fit_transform(winsorized_data[min_max_scaling])

In [None]:
features_to_scale = ['Var143', 'Var7', 'Var181', 'Var44']
imputed_data[features_to_scale] = scaler.fit_transform(winsorized_data[features_to_scale])


In [None]:
# Define the features to scale using StandardScaler
features_to_standardize = ['Var144', 'Var35', 'Var78', 'Var65', 'Var132']

# Initialize StandardScaler
scaler = StandardScaler()

# Apply the scaler to the selected columns
imputed_data[features_to_standardize] = scaler.fit_transform(winsorized_data[features_to_standardize])

# Check the standardized data
print(winsorized_data[features_to_standardize].head())

In [None]:

# List of features to apply PowerTransformer
features_to_transform = [
    'Var126', 'Var24', 'Var85', 'Var83', 'Var109', 
    'Var73', 'Var112', 'Var25', 'Var123', 'Var160', 'Var74',
    'Var21', 'Var22', 'Var119', 'Var6', 'Var140', 'Var13', 'Var28', 
    'Var125', 'Var149', 'Var163', 'Var76', 'Var38', 'Var134', 'Var133', 
    'Var57', 'Var153', 'Var81', 'Var113'
]

# Initialize PowerTransformer (Yeo-Johnson is used by default)
pt = PowerTransformer(method='yeo-johnson')

# Apply PowerTransformer to the selected features
imputed_data[features_to_transform] = pt.fit_transform(winsorized_data[features_to_transform])

# Check the result (first few rows) after transformation
winsorized_data[features_to_transform].head()

# Initialize StandardScaler
scaler = StandardScaler()

# Apply StandardScaler to the transformed features
winsorized_data[features_to_transform] = scaler.fit_transform(winsorized_data[features_to_transform])

# Check the result after scaling
winsorized_data[features_to_transform].head()


In [None]:
winsorized_data.shape

In [None]:
# Підрахунок кількості кожного класу в цільовій змінній y
class_counts = winsorized_data['y'].value_counts()

# Відсотковий розподіл кожного класу
class_percentage = winsorized_data['y'].value_counts(normalize=True) * 100

# Виведення результатів
print(f"Кількість класів:\n{class_counts}")
print(f"\nВідсоткове співвідношення класів:\n{class_percentage}")