In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from feature_summary import export_feature_summary
from round_to_nearest import round_to_nearest_multiple
from split_dataset_by_missing_and_type import split_dataset_by_missing_and_type
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder

In [2]:
data = pd.read_csv('final_proj_data.csv')

### Removing Missing Rows and Columns

In [3]:
print(data.shape)
column_threshold = 0.30

missing_percent = data.isnull().mean()
columns_to_drop = missing_percent[missing_percent > column_threshold].index
cleaned_data = data.drop(columns=columns_to_drop)

print(f"Columns removed: {list(columns_to_drop)}")
print(f"New dataset shape after column removal: {cleaned_data.shape}")

row_threshold = int((1 - 0.30) * cleaned_data.shape[1])
cleaned_data = cleaned_data.dropna(thresh=row_threshold)
print(f"New dataset shape after removing rows with more than 30% missing columns: {cleaned_data.shape}")

cleaned_data.to_csv('fully_cleaned_data.csv', index=False)

(10000, 231)
Columns removed: ['Var1', 'Var2', 'Var3', 'Var4', 'Var5', 'Var8', 'Var9', 'Var10', 'Var11', 'Var12', 'Var14', 'Var15', 'Var16', 'Var17', 'Var18', 'Var19', 'Var20', 'Var23', 'Var26', 'Var27', 'Var29', 'Var30', 'Var31', 'Var32', 'Var33', 'Var34', 'Var36', 'Var37', 'Var39', 'Var40', 'Var41', 'Var42', 'Var43', 'Var45', 'Var46', 'Var47', 'Var48', 'Var49', 'Var50', 'Var51', 'Var52', 'Var53', 'Var54', 'Var55', 'Var56', 'Var58', 'Var59', 'Var60', 'Var61', 'Var62', 'Var63', 'Var64', 'Var66', 'Var67', 'Var68', 'Var69', 'Var70', 'Var71', 'Var72', 'Var75', 'Var77', 'Var79', 'Var80', 'Var82', 'Var84', 'Var86', 'Var87', 'Var88', 'Var89', 'Var90', 'Var91', 'Var92', 'Var93', 'Var94', 'Var95', 'Var96', 'Var97', 'Var98', 'Var99', 'Var100', 'Var101', 'Var102', 'Var103', 'Var104', 'Var105', 'Var106', 'Var107', 'Var108', 'Var110', 'Var111', 'Var114', 'Var115', 'Var116', 'Var117', 'Var118', 'Var120', 'Var121', 'Var122', 'Var124', 'Var127', 'Var128', 'Var129', 'Var130', 'Var131', 'Var135', 'Var1

### Imputation

In [4]:
# Separate numeric and categorical feature names
numeric_features = cleaned_data.select_dtypes(include=['int64', 'float64']).columns
categorical_features = cleaned_data.select_dtypes(include=['object']).columns

export_feature_summary(cleaned_data, 'feature_summary.csv', unique_threshold = 40)
summary_df = pd.read_csv('feature_summary.csv')

Feature summary exported to feature_summary.csv


In [5]:
# MISSING VALUES
features_with_missing_values_count = summary_df[summary_df['Missing Value Count'] > 0].shape[0]
print(features_with_missing_values_count)
split_dataset_by_missing_and_type(summary_df)

25


'Files saved successfully.'

In [6]:
# Categorical variables with no missing values
categorical_variables_no_missing = [
    'Var196', 'Var207', 'Var226', 'Var211', 'Var198', 'Var199', 
    'Var202', 'Var204', 'Var212', 'Var216', 'Var220', 'Var222', 
    'Var227', 'Var193', 'Var228', 'Var195', 'Var210', 'Var221'
]

# Numerical variables with no missing values
numerical_variables_no_missing = [
    'y', 'Var181', 'Var173', 'Var35', 'Var143', 'Var132', 'Var78', 'Var44',
    'Var163', 'Var160', 'Var153', 'Var134', 'Var133', 'Var123', 'Var22', 'Var85',
    'Var83', 'Var76', 'Var73', 'Var57', 'Var38', 'Var28', 'Var25', 'Var113', 'Var112'
]

#### Numerical Imputation

In [7]:
# Numerical continuous variables with missing values <= 100
numerical_continuous_missing_leq_100 = ['Var6', 'Var13', 'Var21', 'Var74', 'Var81', 'Var119', 'Var125', 'Var140']
# Perform median imputation for the specified variables
for col in numerical_continuous_missing_leq_100:
    if col in cleaned_data.columns:
        median_value = cleaned_data[col].median()
        cleaned_data[col] = cleaned_data[col].fillna(median_value)

In [8]:
# Numerical continuous variables with missing values > 100 and <= 1000
numerical_continuous_missing_between_100_and_1000 = ['Var24', 'Var109', 'Var149']
# Numerical continuous variables with missing values > 1000
numerical_continuous_missing_gt_1000 = ['Var126']
# Numerical ordinal variables with a common divisor of 7
numerical_ordinal_divisor_7_group = ['Var7']
# Numerical ordinal variables with a common divisor of 9
numerical_ordinal_divisor_9_group = ['Var144', 'Var65']

# Create a SimpleImputer object with the median strategy
median_imputer = SimpleImputer(strategy='median')

# Apply the imputation on the selected columns
cleaned_data[numerical_continuous_missing_gt_1000] = median_imputer.fit_transform(cleaned_data[numerical_continuous_missing_gt_1000])
cleaned_data[numerical_continuous_missing_between_100_and_1000] = median_imputer.fit_transform(cleaned_data[numerical_continuous_missing_between_100_and_1000])
cleaned_data[numerical_ordinal_divisor_7_group] = median_imputer.fit_transform(cleaned_data[numerical_ordinal_divisor_7_group])
cleaned_data[numerical_ordinal_divisor_9_group] = median_imputer.fit_transform(cleaned_data[numerical_ordinal_divisor_9_group])

# Apply rounding to nearest multiple of 7 for Var7
cleaned_data[numerical_ordinal_divisor_7_group] = cleaned_data[numerical_ordinal_divisor_7_group].apply(lambda x: round_to_nearest_multiple(x, 7))

# Apply rounding to nearest multiple of 9 for Var144 and Var65
for var in numerical_ordinal_divisor_9_group:
    cleaned_data[var] = cleaned_data[var].apply(lambda x: round_to_nearest_multiple(x, 9))


#### Categorical Imputation

In [9]:
# CATEGORICAL
# Categorical continuous variables
categorical_continuous_variables = ['Var192', 'Var197', 'Var217']

# Create SimpleImputer object with the most frequent strategy
mode_imputer = SimpleImputer(strategy='most_frequent')
cleaned_data[categorical_continuous_variables] = mode_imputer.fit_transform(cleaned_data[categorical_continuous_variables])

# Categorical variables with few unique values and up to 345 missing
categorical_variables_few_unique_missing_leq_345 = ['Var203', 'Var205', 'Var218', 'Var208']
cleaned_data[categorical_variables_few_unique_missing_leq_345] = mode_imputer.fit_transform(cleaned_data[categorical_variables_few_unique_missing_leq_345])

# Categorical variables with many unique values and little missing
categorical_variables_many_unique_missing_low = ['Var206']
# Apply the imputation
constant_imputer = SimpleImputer(strategy='constant', fill_value='Unknown')
cleaned_data[categorical_variables_many_unique_missing_low] = constant_imputer.fit_transform(cleaned_data[categorical_variables_many_unique_missing_low])

# Categorical variables with many unique values and many missing
categorical_variables_many_unique_missing_high = ['Var219']
# Apply the KNN imputation on categorical variables with many unique values
cleaned_data[categorical_variables_many_unique_missing_high] = constant_imputer.fit_transform(cleaned_data[categorical_variables_many_unique_missing_high])

# Categorical variables with few unique values and many missing
categorical_variables_few_unique_missing_high = ['Var223']
# Apply most frequent imputation
cleaned_data[categorical_variables_few_unique_missing_high] = mode_imputer.fit_transform(cleaned_data[categorical_variables_few_unique_missing_high])

### Encoding

In [21]:
imputed_data = cleaned_data
imputed_data.to_csv('fully_imputed_data.csv', index=False)
export_feature_summary(cleaned_data, 'feature_summary.csv', unique_threshold = 40)
summary_df = pd.read_csv('feature_summary.csv')
split_dataset_by_missing_and_type(summary_df)

Feature summary exported to feature_summary.csv


'Files saved successfully.'

In [14]:
binary_features_names = ['Var208', 'Var218', 'Var218']
label_encoders = {}
for feature in binary_features_names:
    label_encoders[feature] = LabelEncoder()
    imputed_data[feature] = label_encoders[feature].fit_transform(imputed_data[feature])


Unnamed: 0,Var208,Var218,Var218.1
0,0,1,1
1,0,0,0
2,0,1,1
3,0,1,1
4,0,1,1


In [15]:
one_hot_encoding_names = ['Var196', 'Var205', 'Var223', 'Var203', 'Var210', 'Var221', 'Var227', 'Var207']
# Apply one-hot encoding using pd.get_dummies
imputed_data_one_hot_encoded = pd.get_dummies(imputed_data, columns=one_hot_encoding_names)

Unnamed: 0,Var6,Var7,Var13,Var21,Var22,Var24,Var25,Var28,Var35,Var38,...,Var207_5iay,Var207_6C53VA1kCv,Var207_7M47J5GA0pTYIFxg5uy,Var207_DHn_WUyBhW_whjA88g9bvA64_,Var207_EBKcR3s6B22tD6gC36gm6S,Var207_GjJ35utlTa_GNSvxxpb9ju,Var207_Kxdu,Var207_NKv3VA1BpP,Var207_me75fM6ugJ,Var207_wXfldy7
0,812.0,14.0,1252.0,156.0,195.0,0.0,40.0,286.96,0.0,4850466.0,...,False,False,True,False,False,False,False,False,False,False
1,2688.0,7.0,8820.0,364.0,455.0,4.0,288.0,200.0,0.0,132072.0,...,False,False,False,False,False,False,False,False,True,False
2,1015.0,14.0,1784.0,136.0,170.0,2.0,40.0,294.48,0.0,3223524.0,...,False,False,True,False,False,False,False,False,False,False
3,168.0,0.0,0.0,24.0,30.0,0.0,0.0,644.24,0.0,2135430.0,...,False,False,False,False,False,False,False,False,True,False
4,14.0,0.0,0.0,36.0,45.0,0.0,0.0,239.84,0.0,3110400.0,...,False,False,False,False,False,False,False,False,True,False


In [16]:
freequency_encoding_names = ['Var195', 'Var219', 'Var206', 'Var226', 'Var228', 'Var193', 'Var212', 'Var204']
# Apply frequency encoding to each feature
for feature in freequency_encoding_names:
    frequency_map = imputed_data[feature].value_counts() / len(imputed_data)
    imputed_data[feature] = imputed_data[feature].map(frequency_map)

# Display the first few rows of the dataset after frequency encoding
print(imputed_data[freequency_encoding_names].head())

    Var195    Var219    Var206    Var226    Var228    Var193    Var212  \
0  0.95793  0.013106  0.010683  0.050771  0.062445  0.014978  0.007819   
1  0.95793  0.020595  0.019934  0.076762  0.103965  0.173348  0.153744   
2  0.95793  0.821476  0.065529  0.050771  0.038767  0.672026  0.002093   
3  0.95793  0.821476  0.129736  0.155837  0.595485  0.672026  0.526432   
4  0.95793  0.821476  0.383590  0.076762  0.595485  0.672026  0.526432   

     Var204  
0  0.020154  
1  0.019383  
2  0.020154  
3  0.022026  
4  0.019383  


In [18]:
target_encoding_names = ['Var197', 'Var192']
# Apply target encoding
for feature in target_encoding_names:
    # Create a mapping from category to mean target value
    target_mean = imputed_data.groupby(feature)['y'].mean()
    imputed_data[feature] = imputed_data[feature].map(target_mean)

# Display the first few rows of the dataset after target encoding
print(imputed_data[target_encoding_names].head())

     Var197    Var192
0  0.098361  0.140000
1  0.126829  0.046512
2  0.134375  0.000000
3  0.134375  0.155556
4  0.117647  0.051282


In [19]:
smoothning_target_encoding_names = ['Var220', 'Var198', 'Var216', 'Var199', 'Var222']
# Assuming your target variable is called 'y' (replace 'y' with the actual target column name)
global_mean = imputed_data['y'].mean()

# Define a smoothing factor (you can adjust this value)
alpha = 10

# Apply smoothing target encoding
for feature in smoothning_target_encoding_names:
    # Calculate category mean and the count of occurrences
    category_stats = imputed_data.groupby(feature)['y'].agg(['mean', 'count'])
    
    # Apply the smoothing formula
    smooth = (category_stats['count'] * category_stats['mean'] + alpha * global_mean) / (category_stats['count'] + alpha)
    
    # Map the smoothed values back to the original feature
    imputed_data[feature] = imputed_data[feature].map(smooth)

# Display the first few rows of the dataset after smoothing target encoding
print(imputed_data[smoothning_target_encoding_names].head())

     Var220    Var198    Var216    Var199    Var222
0  0.002412  0.002412  0.110866  0.209440  0.002412
1  0.073911  0.073911  0.102338  0.243477  0.073911
2  0.022207  0.022207  0.110866  0.093412  0.022207
3  0.195906  0.195906  0.222026  0.253828  0.195906
4  0.083150  0.083150  0.270281  0.224574  0.083150


In [20]:
hashing_encoding_names = ['Var202', 'Var217']
import hashlib

# Define the number of buckets for the hash function (adjust this value as needed)
n_buckets = 100

# Function to apply hashing encoding
def hash_encode(value, n_buckets):
    return int(hashlib.md5(str(value).encode()).hexdigest(), 16) % n_buckets

# Apply hashing encoding to each feature
for feature in hashing_encoding_names:
    imputed_data[feature + '_hashed'] = imputed_data[feature].apply(lambda x: hash_encode(x, n_buckets))

# Display the first few rows of the dataset after hashing encoding
print(imputed_data[[feature + '_hashed' for feature in hashing_encoding_names]].head())


   Var202_hashed  Var217_hashed
0             89             95
1             20             71
2             28             77
3              6             35
4             76             68
