In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from scipy import stats

In [3]:
# Load the dataset
file_path = 'application_train.csv'
df = pd.read_csv(file_path)

# Display basic info and first few rows
print("Original DataFrame Info:")
print(df.info())
print("\nFirst few rows of the dataset:")
print(df.head())

Original DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 307511 entries, 0 to 307510
Columns: 122 entries, SK_ID_CURR to AMT_REQ_CREDIT_BUREAU_YEAR
dtypes: float64(65), int64(41), object(16)
memory usage: 286.2+ MB
None

First few rows of the dataset:
   SK_ID_CURR  TARGET NAME_CONTRACT_TYPE CODE_GENDER FLAG_OWN_CAR  \
0      100002       1         Cash loans           M            N   
1      100003       0         Cash loans           F            N   
2      100004       0    Revolving loans           M            Y   
3      100006       0         Cash loans           F            N   
4      100007       0         Cash loans           M            N   

  FLAG_OWN_REALTY  CNT_CHILDREN  AMT_INCOME_TOTAL  AMT_CREDIT  AMT_ANNUITY  \
0               Y             0          202500.0    406597.5      24700.5   
1               N             0          270000.0   1293502.5      35698.5   
2               Y             0           67500.0    135000.0       6750.0   
3  

In [5]:
# 1. Data Cleaning: Removing duplicates and handling outliers
# Removing duplicate rows
df_cleaned = df.drop_duplicates().copy()

# Handling outliers using Z-score method
# Selecting numeric columns only
numeric_cols = df_cleaned.select_dtypes(include=[np.number]).columns
z_scores = np.abs(stats.zscore(df_cleaned[numeric_cols], nan_policy='omit'))
# Filter rows where all z-scores are less than 3 (indicating no extreme outliers)
df_cleaned = df_cleaned[(z_scores < 3).all(axis=1)].copy()


In [6]:
# 2. Transformation: Log and square root transformation (for demonstration, using 'AMT_INCOME_TOTAL')
df_cleaned['AMT_INCOME_TOTAL_log'] = np.log1p(df_cleaned['AMT_INCOME_TOTAL'].fillna(0))  # Log transformation
df_cleaned['AMT_INCOME_TOTAL_sqrt'] = np.sqrt(df_cleaned['AMT_INCOME_TOTAL'].fillna(0))  # Square root transformation


In [7]:
# 3. Normalization: Scaling data between 0 and 1
# For demonstration, normalizing 'AMT_CREDIT' and 'AMT_ANNUITY'
scaler = MinMaxScaler()
if df_cleaned[['AMT_CREDIT', 'AMT_ANNUITY']].dropna().shape[0] > 0:
    df_cleaned[['AMT_CREDIT_normalized', 'AMT_ANNUITY_normalized']] = scaler.fit_transform(df_cleaned[['AMT_CREDIT', 'AMT_ANNUITY']].fillna(0))


In [8]:
# 4. Standardization: Standardizing features to zero mean and unit variance
standardizer = StandardScaler()
if df_cleaned[['AMT_CREDIT', 'AMT_ANNUITY']].dropna().shape[0] > 0:
    df_cleaned[['AMT_CREDIT_standardized', 'AMT_ANNUITY_standardized']] = standardizer.fit_transform(df_cleaned[['AMT_CREDIT', 'AMT_ANNUITY']].fillna(0))


In [9]:
# 5. Encoding: Converting categorical variables into numerical values
# One-Hot Encoding for 'NAME_CONTRACT_TYPE'
onehot_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
if df_cleaned[['NAME_CONTRACT_TYPE']].dropna().shape[0] > 0:
    contract_type_encoded = onehot_encoder.fit_transform(df_cleaned[['NAME_CONTRACT_TYPE']].fillna('missing'))
    contract_type_encoded_df = pd.DataFrame(contract_type_encoded, columns=onehot_encoder.get_feature_names_out(['NAME_CONTRACT_TYPE']))
    df_encoded = pd.concat([df_cleaned, contract_type_encoded_df], axis=1)
else:
    df_encoded = df_cleaned.copy()

# Label Encoding for 'CODE_GENDER'
label_encoder = LabelEncoder()
if df_encoded[['CODE_GENDER']].dropna().shape[0] > 0:
    df_encoded['CODE_GENDER_encoded'] = label_encoder.fit_transform(df_encoded['CODE_GENDER'].fillna('missing'))




In [10]:
# 6. Imputation: Filling missing values with mean for numeric and 'missing' for categorical
imputer_numeric = SimpleImputer(strategy='mean')
imputer_categorical = SimpleImputer(strategy='constant', fill_value='missing')

# Impute numeric columns
numeric_cols = df_encoded.select_dtypes(include=[np.number]).columns
df_imputed = df_encoded.copy()
df_imputed[numeric_cols] = imputer_numeric.fit_transform(df_imputed[numeric_cols])

# Impute categorical columns
categorical_cols = df_encoded.select_dtypes(include=[object]).columns
df_imputed[categorical_cols] = imputer_categorical.fit_transform(df_imputed[categorical_cols])


In [11]:
# 7. Handling Missing Data: Dropping rows or columns with missing data
# Drop rows with any missing values
df_dropped = df_encoded.dropna()


In [12]:
# 8. Dimensionality Reduction: Applying PCA to reduce the number of features
# Selecting numeric columns and dropping any remaining NaNs
df_pca = df_imputed.select_dtypes(include=[np.number]).dropna()
if df_pca.shape[1] > 1:
    pca = PCA(n_components=2)
    df_pca_reduced = pd.DataFrame(pca.fit_transform(df_pca), columns=['PCA1', 'PCA2'])
else:
    df_pca_reduced = pd.DataFrame()  # Create an empty DataFrame if PCA cannot be applied


In [13]:
# 9. Data Integration: Demonstrating merging of datasets
# For illustration, we'll split the dataframe into two and merge them back
df_part1 = df_encoded.iloc[:len(df_encoded)//2, :]
df_part2 = df_encoded.iloc[len(df_encoded)//2:, :]
df_combined = pd.concat([df_part1, df_part2], axis=0)


In [14]:
# 10. Sampling: Randomly sample 100 rows from the dataset
sample_size = min(100, df_encoded.shape[0])
df_sampled = df_encoded.sample(n=sample_size)


In [15]:
# Outputs to see the transformations
print("\nCleaned DataFrame:")
print(df_cleaned.head())
print("\nEncoded DataFrame:")
print(df_encoded.head())
print("\nImputed DataFrame:")
print(df_imputed.head())
print("\nDropped Missing Data DataFrame:")
print(df_dropped.head())
print("\nPCA Reduced DataFrame:")
print(df_pca_reduced.head())
print("\nCombined DataFrame:")
print(df_combined.head())
print("\nSampled DataFrame:")
print(df_sampled.head())


Cleaned DataFrame:
     SK_ID_CURR  TARGET NAME_CONTRACT_TYPE CODE_GENDER FLAG_OWN_CAR  \
71       100083       0         Cash loans           M            Y   
124      100145       0         Cash loans           F            Y   
143      100165       0         Cash loans           F            Y   
249      100289       0         Cash loans           M            Y   
296      100341       0         Cash loans           M            Y   

    FLAG_OWN_REALTY  CNT_CHILDREN  AMT_INCOME_TOTAL  AMT_CREDIT  AMT_ANNUITY  \
71                Y             0          103500.0    573628.5      24435.0   
124               Y             1          202500.0    260725.5      16789.5   
143               Y             0          175500.0   1293502.5      35568.0   
249               N             0          202500.0    526491.0      26878.5   
296               Y             0           76500.0    545040.0      20677.5   

     ...  AMT_REQ_CREDIT_BUREAU_WEEK AMT_REQ_CREDIT_BUREAU_MON  \
71   .