In [7]:
# Let's reload the data and start fresh
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Read the raw German Credit dataset from the current working directory.
df = pd.read_csv("GermanCredit.csv")

# Print a quick structural overview (dimensions, headers, first five rows)
# before any cleaning is applied. A single print call with a newline
# separator emits exactly the same text as one print per item.
print(
    "Original DataFrame Info:",
    f"Shape: {df.shape}",
    f"Column names: {df.columns.tolist()}",
    "\nSample data:",
    df.head(),
    sep="\n",
)

# Report, per column, how many cells hold the placeholder value 'none'.
# A cell qualifies only when its string form -- trimmed of surrounding
# whitespace and quote characters, then lower-cased -- is exactly "none".
# Series.str.count('none') would instead tally every substring occurrence,
# so a value such as "nonexistent" would be miscounted as a 'none' entry.
print("\nColumns with 'none' values:")
none_counts = {}
for col in df.columns:
    none_count = (
        df[col].astype(str).str.strip().str.strip("'\"").str.lower() == 'none'
    ).sum()
    if none_count > 0:
        none_counts[col] = none_count
        print(f"  - {col}: {none_count} none values")

# Function to find and drop columns with most 'none' values
def drop_least_contributing_columns(dataframe, n=3):
    """Drop up to ``n`` columns holding the most 'none' placeholder values.

    A cell counts as 'none' when its string form, after trimming surrounding
    whitespace and quote characters and lower-casing, is exactly ``"none"``.
    Columns are ranked by that count (highest first, ties broken by column
    name). Only columns with at least one 'none' cell are eligible, so fewer
    than ``n`` columns are dropped when fewer qualify.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to inspect; it is not modified.
    n : int, default 3
        Maximum number of columns to drop.

    Returns
    -------
    pandas.DataFrame
        A new frame without the selected columns.
    """
    none_counts = {}
    for col in dataframe.columns:
        # Exact match rather than substring search: "nonexistent" is not 'none'.
        normalized = dataframe[col].astype(str).str.strip().str.strip("'\"").str.lower()
        none_counts[col] = int((normalized == 'none').sum())

    # Sort by count (highest first) and then by column name for stable ties.
    sorted_columns = sorted(none_counts.items(), key=lambda x: (-x[1], x[0]))

    # Never drop a column that contains no 'none' cells at all.
    eligible = [col for col, count in sorted_columns if count > 0]
    columns_to_drop = eligible[:n]

    print(f"Dropping columns: {columns_to_drop}")
    return dataframe.drop(columns=columns_to_drop)

# Remove the three columns selected by drop_least_contributing_columns.
df_cleaned = drop_least_contributing_columns(df, n=3)

# Show what is left so we can confirm checking_status and savings_status
# are still present after the drop.
print("\nAfter dropping columns:")
print(f"Shape: {df_cleaned.shape}")
print(f"Column names: {df_cleaned.columns.tolist()}")

# List the distinct raw categories of each status column, or report that
# the column is missing.
for status_col in ('checking_status', 'savings_status'):
    print(f"\nUnique values in {status_col}:")
    if status_col in df_cleaned.columns:
        print(df_cleaned[status_col].unique())
    else:
        print("Column not found")

Original DataFrame Info:
Shape: (1000, 21)
Column names: ['checking_status', 'duration', 'credit_history', 'purpose', 'credit_amount', 'savings_status', 'employment', 'installment_commitment', 'personal_status', 'other_parties', 'residence_since', 'property_magnitude', 'age', 'other_payment_plans', 'housing', 'existing_credits', 'job', 'num_dependents', 'own_telephone', 'foreign_worker', 'class']

Sample data:
  checking_status  duration                    credit_history  \
0            '<0'         6  'critical/other existing credit'   
1      '0<=X<200'        48                   'existing paid'   
2   'no checking'        12  'critical/other existing credit'   
3            '<0'        42                   'existing paid'   
4            '<0'        24              'delayed previously'   

               purpose  credit_amount      savings_status employment  \
0             radio/tv           1169  'no known savings'      '>=7'   
1             radio/tv           5951              

In [9]:
# Some category values are wrapped in single quotes in the CSV
# (e.g. "'no checking'"); strip them from every text column.
# The dtype test accepts both object and pandas string dtypes: a plain
# `dtype == 'object'` comparison is False for string-dtype columns, which
# would leave their quotes in place and break the label mappings that follow.
for col in df_cleaned.columns:
    if pd.api.types.is_string_dtype(df_cleaned[col].dtype):
        # regex=False: the apostrophe is a literal character, not a pattern.
        df_cleaned[col] = df_cleaned[col].str.replace("'", "", regex=False)
print("After removing apostrophes")
print(df_cleaned['checking_status'].unique())
print(df_cleaned['savings_status'].unique())

# Recode the raw checking_status bands into readable labels.
checking_status_mapping = {
    'no checking': 'No Checking',
    '<0': 'Low',
    '0<=X<200': 'Medium',
    '>=200': 'High'
}
# Series.replace (rather than Series.map) leaves values that are not keys of
# the mapping untouched. Re-running this cell on an already-recoded column is
# therefore a no-op instead of turning every label into NaN.
df_cleaned['checking_status'] = df_cleaned['checking_status'].replace(checking_status_mapping)

print("\nAfter mapping checking_status:")
print(df_cleaned['checking_status'].value_counts())

# Recode the raw savings_status bands into readable labels.
# Note that both '500<=X<1000' and '>=1000' collapse into the single
# 'High' label.
saving_status_mapping = {
    'no known savings': 'No Savings',
    '<100': 'Low',
    '100<=X<500': 'Medium',
    '500<=X<1000': 'High',
    '>=1000': 'High'
}
# Series.replace (rather than Series.map) leaves values that are not keys of
# the mapping untouched. Re-running this cell on an already-recoded column is
# therefore a no-op instead of turning every label into NaN.
df_cleaned['savings_status'] = df_cleaned['savings_status'].replace(saving_status_mapping)

print("\nAfter mapping savings_status:")
print(df_cleaned['savings_status'].value_counts())

After removing apostrophes
['<0' '0<=X<200' 'no checking' '>=200']
['no known savings' '<100' '500<=X<1000' '>=1000' '100<=X<500']

After mapping checking_status:
checking_status
No Checking    394
Low            274
Medium         269
High            63
Name: count, dtype: int64

After mapping savings_status:
savings_status
Low           603
No Savings    183
High          111
Medium        103
Name: count, dtype: int64
