In [1]:
import pandas as pd
from feature_summary import export_feature_summary
from split_dataset_by_missing_and_type import split_dataset_by_missing_and_type

### Пояснення до кожного файлу:

**final_proj_data.csv**: Це навчальний набір даних, який використовується для побудови та навчання вашої моделі машинного навчання.

**final_proj_test.csv**: Це тестовий набір даних, на якому ви генеруватимете прогнози для подальшої оцінки.

**final_proj_sample_submission.csv**: Це приклад правильного формату та структури для подання ваших прогнозів на Kaggle.

**final_project.ipynb**: Це Jupyter Notebook, де ви пишете код, тренуєте свою модель і генеруєте прогнози для змагання.

In [2]:
# Load the training set into `data`.
data = pd.read_csv(filepath_or_buffer='final_proj_data.csv')

### Removing Missing Rows and Columns

In [3]:
# Drop sparse columns first, then sparse rows, and save the result to CSV.
print(data.shape)

# Maximum tolerated share of missing values, per column and per row.
column_threshold = 0.30
row_missing_threshold = 0.30

# Share of missing values per column; columns strictly above the limit are removed.
missing_percent = data.isnull().mean()
columns_to_drop = missing_percent[missing_percent > column_threshold].index
cleaned_data = data.drop(columns=columns_to_drop)

print(f"Columns removed: {list(columns_to_drop)}")
print(f"New dataset shape after column removal: {cleaned_data.shape}")

# dropna(thresh=N) keeps rows that have at least N non-missing values.
# A row may have at most floor(30% of the columns) missing values, so N is the
# column count minus that allowance. Flooring the *required* count instead
# (int(0.70 * n_columns)) would let through rows that are slightly over the
# 30% limit whenever 0.70 * n_columns is not a whole number.
n_columns = cleaned_data.shape[1]
max_missing_per_row = int(row_missing_threshold * n_columns)
row_threshold = n_columns - max_missing_per_row
cleaned_data = cleaned_data.dropna(thresh=row_threshold)
print(f"New dataset shape after removing rows with more than 30% missing columns: {cleaned_data.shape}")

cleaned_data.to_csv('fully_cleaned_data.csv', index=False)

(10000, 231)
Columns removed: ['Var1', 'Var2', 'Var3', 'Var4', 'Var5', 'Var8', 'Var9', 'Var10', 'Var11', 'Var12', 'Var14', 'Var15', 'Var16', 'Var17', 'Var18', 'Var19', 'Var20', 'Var23', 'Var26', 'Var27', 'Var29', 'Var30', 'Var31', 'Var32', 'Var33', 'Var34', 'Var36', 'Var37', 'Var39', 'Var40', 'Var41', 'Var42', 'Var43', 'Var45', 'Var46', 'Var47', 'Var48', 'Var49', 'Var50', 'Var51', 'Var52', 'Var53', 'Var54', 'Var55', 'Var56', 'Var58', 'Var59', 'Var60', 'Var61', 'Var62', 'Var63', 'Var64', 'Var66', 'Var67', 'Var68', 'Var69', 'Var70', 'Var71', 'Var72', 'Var75', 'Var77', 'Var79', 'Var80', 'Var82', 'Var84', 'Var86', 'Var87', 'Var88', 'Var89', 'Var90', 'Var91', 'Var92', 'Var93', 'Var94', 'Var95', 'Var96', 'Var97', 'Var98', 'Var99', 'Var100', 'Var101', 'Var102', 'Var103', 'Var104', 'Var105', 'Var106', 'Var107', 'Var108', 'Var110', 'Var111', 'Var114', 'Var115', 'Var116', 'Var117', 'Var118', 'Var120', 'Var121', 'Var122', 'Var124', 'Var127', 'Var128', 'Var129', 'Var130', 'Var131', 'Var135', 'Var1

In [4]:
# Column names grouped by dtype: 64-bit numeric columns vs. object columns.
numeric_dtypes = ['int64', 'float64']
categorical_dtypes = ['object']
numeric_features = cleaned_data.select_dtypes(include=numeric_dtypes).columns
categorical_features = cleaned_data.select_dtypes(include=categorical_dtypes).columns

# Write the per-feature summary with the project helper, then load it back
# as a DataFrame for the cells below.
summary_path = 'feature_summary.csv'
export_feature_summary(cleaned_data, summary_path, unique_threshold=40)
summary_df = pd.read_csv(summary_path)

Feature summary exported to feature_summary.csv


In [5]:
# MISSING VALUES
# Count the summary rows (one per feature, presumably) that report at least one missing value.
has_missing = summary_df['Missing Value Count'] > 0
features_with_missing_values_count = int(has_missing.sum())
print(features_with_missing_values_count)

# Left as the final bare expression so the helper's return value is shown as the cell output.
split_dataset_by_missing_and_type(summary_df)

25


'Files saved successfully.'

In [None]:
# Hand-maintained feature groups, keyed by type and amount of missing data.
# NOTE(review): presumably copied from the files written by
# split_dataset_by_missing_and_type - regenerate if the cleaning thresholds change.

# --- Features without missing values ---------------------------------------

# Categorical, complete.
categorical_variables_no_missing = [
    'Var196', 'Var207', 'Var226', 'Var211', 'Var198', 'Var199', 'Var202', 'Var204', 'Var212',
    'Var216', 'Var220', 'Var222', 'Var227', 'Var193', 'Var228', 'Var195', 'Var210', 'Var221',
]

# Numerical, complete.
# NOTE(review): 'y' looks like the target column - confirm it is excluded
# before this list is used as model input.
numerical_variables_no_missing = [
    'y',
    'Var181', 'Var173', 'Var35', 'Var143', 'Var132', 'Var78', 'Var44', 'Var163',
    'Var160', 'Var153', 'Var134', 'Var133', 'Var123', 'Var22', 'Var85', 'Var83',
    'Var76', 'Var73', 'Var57', 'Var38', 'Var28', 'Var25', 'Var113', 'Var112',
]

# --- Numerical continuous features, bucketed by missing-value count --------

# At most 100 missing values.
numerical_continuous_missing_leq_100 = [
    'Var6', 'Var13', 'Var21', 'Var74',
    'Var81', 'Var119', 'Var125', 'Var140',
]

# More than 100 and at most 1000 missing values.
numerical_continuous_missing_between_100_and_1000 = [
    'Var24', 'Var109', 'Var149',
]

# More than 1000 missing values.
numerical_continuous_missing_gt_1000 = [
    'Var126',
]

# --- Numerical ordinal features, grouped by common divisor -----------------

# Values share a common divisor of 7.
numerical_ordinal_divisor_7_group = [
    'Var7',
]

# Values share a common divisor of 9.
numerical_ordinal_divisor_9_group = [
    'Var144', 'Var65',
]

# --- Categorical features with missing values ------------------------------

# Categorical features treated as "continuous".
categorical_continuous_variables = [
    'Var192', 'Var197', 'Var217',
]

# Few unique values, up to 345 missing.
categorical_variables_few_unique_missing_leq_345 = [
    'Var203', 'Var205', 'Var218', 'Var208',
]

# Many unique values, little missing.
categorical_variables_many_unique_missing_low = [
    'Var206',
]

# Many unique values, many missing.
categorical_variables_many_unique_missing_high = [
    'Var219',
]

# Few unique values, many missing.
categorical_variables_few_unique_missing_high = [
    'Var223',
]