# Credit Score Classification


## Data Loading

This is where I will load in the csv files containing the data

In [1]:
import pandas as pd

In [2]:
train = pd.read_csv('capstone_train_data.csv')

  train = pd.read_csv('capstone_train_data.csv')


In [3]:
train.head()

Unnamed: 0,ID,Customer_ID,Month,Name,Age,SSN,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,...,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,Credit_Score
0,0x1602,CUS_0xd40,January,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,1824.843333,3,...,_,809.98,26.82262,22 Years and 1 Months,No,49.574949,80.41529544,High_spent_Small_value_payments,312.4940887,Good
1,0x1603,CUS_0xd40,February,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,,3,...,Good,809.98,31.94496,,No,49.574949,118.2802216,Low_spent_Large_value_payments,284.6291625,Good
2,0x1604,CUS_0xd40,March,Aaron Maashoh,-500,821-00-0265,Scientist,19114.12,,3,...,Good,809.98,28.609352,22 Years and 3 Months,No,49.574949,81.69952126,Low_spent_Medium_value_payments,331.2098629,Good
3,0x1605,CUS_0xd40,April,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,,3,...,Good,809.98,31.377862,22 Years and 4 Months,No,49.574949,199.4580744,Low_spent_Small_value_payments,223.4513097,Good
4,0x1606,CUS_0xd40,May,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,1824.843333,3,...,Good,809.98,24.797347,22 Years and 5 Months,No,49.574949,41.42015309,High_spent_Medium_value_payments,341.489231,Good


In [5]:
# Display the data types of each column
print(train.dtypes)

ID                           object
Customer_ID                  object
Month                        object
Name                         object
Age                          object
SSN                          object
Occupation                   object
Annual_Income                object
Monthly_Inhand_Salary       float64
Num_Bank_Accounts             int64
Num_Credit_Card               int64
Interest_Rate                 int64
Num_of_Loan                  object
Type_of_Loan                 object
Delay_from_due_date           int64
Num_of_Delayed_Payment       object
Changed_Credit_Limit         object
Num_Credit_Inquiries        float64
Credit_Mix                   object
Outstanding_Debt             object
Credit_Utilization_Ratio    float64
Credit_History_Age           object
Payment_of_Min_Amount        object
Total_EMI_per_month         float64
Amount_invested_monthly      object
Payment_Behaviour            object
Monthly_Balance              object
Credit_Score                

## Data Cleaning

### Cleaning the training data in this section.

In [6]:
train.isnull().sum()

ID                              0
Customer_ID                     0
Month                           0
Name                         9985
Age                             0
SSN                             0
Occupation                      0
Annual_Income                   0
Monthly_Inhand_Salary       15002
Num_Bank_Accounts               0
Num_Credit_Card                 0
Interest_Rate                   0
Num_of_Loan                     0
Type_of_Loan                11408
Delay_from_due_date             0
Num_of_Delayed_Payment       7002
Changed_Credit_Limit            0
Num_Credit_Inquiries         1965
Credit_Mix                      0
Outstanding_Debt                0
Credit_Utilization_Ratio        0
Credit_History_Age           9030
Payment_of_Min_Amount           0
Total_EMI_per_month             0
Amount_invested_monthly      4479
Payment_Behaviour               0
Monthly_Balance              1200
Credit_Score                    0
dtype: int64

There are lots of names missing in the dataset. Since each name has an associated Customer_ID, it is safe to assume that we can fill in the name based on the Customer_ID for that entry. 

In [7]:
import pandas as pd

# Create a dictionary mapping customer IDs to names
id_to_name = train.groupby('Customer_ID')['Name'].first().to_dict()

# Function to fill missing names based on Customer_ID
def fill_missing_names(row):
    if pd.isna(row['Name']):
        return id_to_name.get(row['Customer_ID'], 'Unknown')
    else:
        return row['Name']

# Apply the function to fill missing names
train['Name'] = train.apply(fill_missing_names, axis=1)


In [8]:
train["Name"].isnull().sum()

0

In the Age column, there are sometimes random - or _ symbols before or after the age that should be removed, so that we can group applicants by age later on. There are also some unrealistic age values (such as 500) that should be changed to the correct age of the person, which can be seen in the other entries. 

In [9]:
# Show the problem
print(train[['Name', 'Age']].head(20))

               Name   Age
0     Aaron Maashoh    23
1     Aaron Maashoh    23
2     Aaron Maashoh  -500
3     Aaron Maashoh    23
4     Aaron Maashoh    23
5     Aaron Maashoh    23
6     Aaron Maashoh    23
7     Aaron Maashoh    23
8   Rick Rothackerj   28_
9   Rick Rothackerj    28
10  Rick Rothackerj    28
11  Rick Rothackerj    28
12  Rick Rothackerj    28
13  Rick Rothackerj    28
14  Rick Rothackerj    28
15  Rick Rothackerj    28
16           Langep    34
17           Langep    34
18           Langep    34
19           Langep    34


In [10]:
# This section will handle the - marks 

# Remove non-numeric characters from the Age column
train['Age'] = train['Age'].str.replace(r'\D', '', regex=True)

# Convert the Age column to numeric type
# Errors = 'coerce' makes sure that all non-convertible values are changed to NaN
train['Age'] = pd.to_numeric(train['Age'], errors='coerce')

# Define the threshold for the age
threshold = 90

# Iterate over rows and update Age if it's greater than the threshold
for index, row in train.iterrows():
    if row['Age'] > threshold:
        name = row['Name']
        prev_index = index - 1
        next_index = index + 1
        
        # Find the previous valid age value with the same name
        while prev_index >= 0 and train.at[prev_index, 'Name'] == name:
            if pd.notnull(train.at[prev_index, 'Age']) and train.at[prev_index, 'Age'] <= threshold:
                train.at[index, 'Age'] = train.at[prev_index, 'Age']
                break
            prev_index -= 1
        
        # Find the next valid age value with the same name
        while next_index < len(train) and train.at[next_index, 'Name'] == name:
            if pd.notnull(train.at[next_index, 'Age']) and train.at[next_index, 'Age'] <= threshold:
                train.at[index, 'Age'] = train.at[next_index, 'Age']
                break
            next_index += 1

# Verify unique values after handling unrealistic ages
unique_ages = train['Age'].unique()
print(unique_ages)


[23 28 34 54 55 21 31 33 30 24 44 45 40 41 32 35 36 39 37 20 46 26 42 19
 48 38 43 22 16 18 15 27 25 14 17 47 53 56 29 49 51 50 52]


In [11]:
# Print the updated DataFrame
print(train[['Name', 'Age']].head(20))

               Name  Age
0     Aaron Maashoh   23
1     Aaron Maashoh   23
2     Aaron Maashoh   23
3     Aaron Maashoh   23
4     Aaron Maashoh   23
5     Aaron Maashoh   23
6     Aaron Maashoh   23
7     Aaron Maashoh   23
8   Rick Rothackerj   28
9   Rick Rothackerj   28
10  Rick Rothackerj   28
11  Rick Rothackerj   28
12  Rick Rothackerj   28
13  Rick Rothackerj   28
14  Rick Rothackerj   28
15  Rick Rothackerj   28
16           Langep   34
17           Langep   34
18           Langep   34
19           Langep   34


The SSN category also sometimes contains non-numerical entries, so we will fix them in a similar way to how we fixed the Name column, the only difference being first we will make the invalid entries null values, and then fill the null values appropriately. 

In [12]:
# Show the problem
print(train[['SSN']].head(10))

           SSN
0  821-00-0265
1  821-00-0265
2  821-00-0265
3  821-00-0265
4  821-00-0265
5  821-00-0265
6  821-00-0265
7    #F%$D@*&8
8  004-07-5839
9  004-07-5839


In [13]:
# Define a function to check if SSN starts with a digit (meaning it is valid)
def check_ssn(ssn):
    if not ssn[0].isdigit():
        return None
    else:
        return ssn

# Apply the function to the 'SSN' column
train['SSN'] = train['SSN'].apply(check_ssn)

In [14]:
# Create a dictionary mapping customer IDs to SSNs
id_to_name = train.groupby('Customer_ID')['SSN'].first().to_dict()

# Function to fill missing SSNs based on Customer_ID
def fill_missing_ssns(row):
    if pd.isna(row['SSN']):
        return id_to_name.get(row['Customer_ID'], 'Unknown')
    else:
        return row['SSN']

# Apply the function to fill missing SSNs
train['SSN'] = train.apply(fill_missing_ssns, axis=1)

In [15]:
print(train[['SSN']].head(10))

           SSN
0  821-00-0265
1  821-00-0265
2  821-00-0265
3  821-00-0265
4  821-00-0265
5  821-00-0265
6  821-00-0265
7  821-00-0265
8  004-07-5839
9  004-07-5839


We have a similar problem yet again with the occupation column, in which some entries that appear to be filled are actually ______ lines. 

In [16]:
# Show the problem
print(train[['Occupation']].head(10))

  Occupation
0  Scientist
1  Scientist
2  Scientist
3  Scientist
4  Scientist
5  Scientist
6  Scientist
7  Scientist
8    _______
9    Teacher


In [17]:
# Define a function to check if the Occupation starts with a letter
def check_occupation(occupation):
    if not occupation[0].isalpha():
        return None
    else:
        return occupation

# Apply the function to the 'Occupation' column
train['Occupation'] = train['Occupation'].apply(check_occupation)

In [18]:
# Create a dictionary mapping customer IDs to occupations
id_to_name = train.groupby('Customer_ID')['Occupation'].first().to_dict()

# Function to fill missing names based on Customer_ID
def fill_missing_occupation(row):
    if pd.isna(row['Occupation']):
        return id_to_name.get(row['Customer_ID'], 'Unknown')
    else:
        return row['Occupation']

# Apply the function to fill missing occupation
train['Occupation'] = train.apply(fill_missing_occupation, axis=1)

In [19]:
print(train[['Occupation']].head(10))

  Occupation
0  Scientist
1  Scientist
2  Scientist
3  Scientist
4  Scientist
5  Scientist
6  Scientist
7  Scientist
8    Teacher
9    Teacher


The Annual Income column also has issues similar to the Age column, so we will once again apply a similar technique to this column in order to fix any odd symbols or unrealistic values that do not fit in with the rest of the data. 

In [20]:
# Show the problem
print(train['Annual_Income'].iloc[52:63])

52     131313.4
53     131313.4
54     10909427
55     131313.4
56    34081.38_
57     34081.38
58     34081.38
59     34081.38
60     34081.38
61     34081.38
62     34081.38
Name: Annual_Income, dtype: object


In [21]:
# This section will handle the - marks 

# Remove non-numeric characters from the Annual_Income column, excluding '.' 
train['Annual_Income'] = train['Annual_Income'].str.replace(r'[^0-9.]+', '', regex=True)

# Convert the Annual_Income column to numeric type
# Errors = 'coerce' makes sure that all non-convertible values are changed to NaN
train['Annual_Income'] = pd.to_numeric(train['Annual_Income'], errors='coerce')

In [22]:
# This section will handle the unrealistic ages

# Iterate over rows and update annual_income if it's not equal to either value above or below it
for index, row in train.iterrows():
    if index > 0 and index < len(train)-1:
        if row['Annual_Income'] != train.at[index-1, 'Annual_Income'] and row['Annual_Income'] != train.at[index+1, 'Annual_Income']:
            train.at[index, 'Annual_Income'] = None

# Create a dictionary mapping customer IDs to annual incomes
id_to_income = train.groupby('Customer_ID')['Annual_Income'].first().to_dict()

# Function to fill missing annual incomes based on Customer_ID
def fill_missing_income(row):
    if pd.isna(row['Annual_Income']):
        return id_to_income.get(row['Customer_ID'], None)
    else:
        return row['Annual_Income']

# Apply the function to fill missing annual incomes
train['Annual_Income'] = train.apply(fill_missing_income, axis=1)
        

In [23]:
print(train['Annual_Income'].iloc[52:63])

52    131313.40
53    131313.40
54    131313.40
55    131313.40
56     34081.38
57     34081.38
58     34081.38
59     34081.38
60     34081.38
61     34081.38
62     34081.38
Name: Annual_Income, dtype: float64


Monthly InHand Salary also has lots of missing entries (roughly 15% of the training data), but not any obvious unrealistic values. We will fill in these missing values using the same technique that we have been using for other null values. 

In [24]:
# Show the problem
print(train[['Monthly_Inhand_Salary']].head(10))

   Monthly_Inhand_Salary
0            1824.843333
1                    NaN
2                    NaN
3                    NaN
4            1824.843333
5                    NaN
6            1824.843333
7            1824.843333
8            3037.986667
9            3037.986667


In [25]:
# Create a dictionary mapping customer IDs to monthly salaries
id_to_income = train.groupby('Customer_ID')['Monthly_Inhand_Salary'].first().to_dict()

# Function to fill missing monthly salaries based on Customer_ID
def fill_missing_monthly_salary(row):
    if pd.isna(row['Monthly_Inhand_Salary']):
        return id_to_income.get(row['Customer_ID'], None)
    else:
        return row['Monthly_Inhand_Salary']

# Apply the function to fill missing monthly salaries
train['Monthly_Inhand_Salary'] = train.apply(fill_missing_monthly_salary, axis=1)

In [26]:
print(train[['Monthly_Inhand_Salary']].head(10))

   Monthly_Inhand_Salary
0            1824.843333
1            1824.843333
2            1824.843333
3            1824.843333
4            1824.843333
5            1824.843333
6            1824.843333
7            1824.843333
8            3037.986667
9            3037.986667


In [27]:
train["Monthly_Inhand_Salary"].isnull().sum()

0

Num_Bank_Accounts, Num_Credit_Card, and Interest_Rate all have some unrealistic entries, so we will fix these now just like how we fixed the Age column. 

In [28]:
# Show the problem
print(train[['Num_Bank_Accounts', 'Num_Credit_Card', 'Interest_Rate']].iloc[324:347])

     Num_Bank_Accounts  Num_Credit_Card  Interest_Rate
324                  5             1005             13
325                  5                5            753
326                  5                5             13
327                  5                5             13
328                  7                9             31
329                  7                9             31
330                  7                9             31
331                  7                9             31
332                  7                9             31
333                  7                9             31
334                  7                9             31
335                  7                9             31
336                  6                3             14
337                  6                3             14
338                  6                3             14
339                572                3             14
340                  6             1327             14
341       

In [29]:
# If the value is negative, change it to 0 before applying the function
train.loc[train['Num_Bank_Accounts'] < 0, 'Num_Bank_Accounts'] = 0

In [30]:
# This section will handle the unrealistic values for the Num_Bank_Accounts column

# Define the threshold for the number of bank accounts
threshold = 20

# Iterate over rows and update Num_Bank_Accounts if it's greater than the threshold
for index, row in train.iterrows():
    if row['Num_Bank_Accounts'] > threshold:
        name = row['Name']
        original_index = index
        while True:
            # Check if the value before and after it are the same
            if index > 0 and index < len(train)-1:
                prev_value = train.at[index-1, 'Num_Bank_Accounts']
                next_value = train.at[index+1, 'Num_Bank_Accounts']
                if prev_value == next_value:
                    replacement_value = prev_value
                else:
                    same_name_indices = train[train['Name'] == name].index
                    prev_index = same_name_indices[same_name_indices < index].max()
                    next_index = same_name_indices[same_name_indices > index].min()
                    if pd.notnull(prev_index):
                        replacement_value = train.at[prev_index, 'Num_Bank_Accounts']
                    elif pd.notnull(next_index):
                        replacement_value = train.at[next_index, 'Num_Bank_Accounts']
                    else:
                        # No adjacent values found with the same name
                        break
            else:
                # Cannot compare with adjacent values
                break

            # Check if replacement value exceeds threshold
            if replacement_value <= threshold:
                train.at[original_index, 'Num_Bank_Accounts'] = replacement_value
                break
            else:
                index = next_index if pd.notnull(next_index) else prev_index


In [31]:
# This section will handle the unrealistic values for the Num_Credit_Card column

# Define the threshold for the number of credit cards
threshold = 20

# Iterate over rows and update Num_Credit_Card if it's greater than the threshold
for index, row in train.iterrows():
    if row['Num_Credit_Card'] > threshold:
        name = row['Name']
        original_index = index
        while True:
            # Check if the value before and after it are the same
            if index > 0 and index < len(train)-1:
                prev_value = train.at[index-1, 'Num_Credit_Card']
                next_value = train.at[index+1, 'Num_Credit_Card']
                if prev_value == next_value:
                    replacement_value = prev_value
                else:
                    same_name_indices = train[train['Name'] == name].index
                    prev_index = same_name_indices[same_name_indices < index].max()
                    next_index = same_name_indices[same_name_indices > index].min()
                    if pd.notnull(prev_index):
                        replacement_value = train.at[prev_index, 'Num_Credit_Card']
                    elif pd.notnull(next_index):
                        replacement_value = train.at[next_index, 'Num_Credit_Card']
                    else:
                        # No adjacent values found with the same name
                        break
            else:
                # Cannot compare with adjacent values
                break

            # Check if replacement value exceeds threshold
            if replacement_value <= threshold:
                train.at[original_index, 'Num_Credit_Card'] = replacement_value
                break
            else:
                index = next_index if pd.notnull(next_index) else prev_index


In [32]:
# This section will handle the unrealistic ages for the Interest_Rate column

# Define the threshold for the interest rate
threshold = 40

# Iterate over rows and update Interest_Rate if it's greater than the threshold
for index, row in train.iterrows():
    if row['Interest_Rate'] > threshold:
        name = row['Name']
        original_index = index
        while True:
            # Check if the value before and after it are the same
            if index > 0 and index < len(train)-1:
                prev_value = train.at[index-1, 'Interest_Rate']
                next_value = train.at[index+1, 'Interest_Rate']
                if prev_value == next_value:
                    replacement_value = prev_value
                else:
                    same_name_indices = train[train['Name'] == name].index
                    prev_index = same_name_indices[same_name_indices < index].max()
                    next_index = same_name_indices[same_name_indices > index].min()
                    if pd.notnull(prev_index):
                        replacement_value = train.at[prev_index, 'Interest_Rate']
                    elif pd.notnull(next_index):
                        replacement_value = train.at[next_index, 'Interest_Rate']
                    else:
                        # No adjacent values found with the same name
                        break
            else:
                # Cannot compare with adjacent values
                break

            # Check if replacement value exceeds threshold
            if replacement_value <= threshold:
                train.at[original_index, 'Interest_Rate'] = replacement_value
                break
            else:
                index = next_index if pd.notnull(next_index) else prev_index


In [33]:
print(train[['Num_Bank_Accounts', 'Num_Credit_Card', 'Interest_Rate']].iloc[324:347])

     Num_Bank_Accounts  Num_Credit_Card  Interest_Rate
324                  5                5             13
325                  5                5             13
326                  5                5             13
327                  5                5             13
328                  7                9             31
329                  7                9             31
330                  7                9             31
331                  7                9             31
332                  7                9             31
333                  7                9             31
334                  7                9             31
335                  7                9             31
336                  6                3             14
337                  6                3             14
338                  6                3             14
339                  6                3             14
340                  6                3             14
341       

The Num_of_Loan column will be fixed in the same way, except we have to first remove the dash marks again. The differce here is this column has a lot of negative values (which is not possible) so it is not as simile as setting a minimum threshold this time. 

In [34]:
# Show the problem
print(train[['Num_of_Loan']].iloc[19:38])

   Num_of_Loan
19           3
20           3
21         967
22           3
23           3
24           1
25           1
26           1
27           1
28           1
29           1
30           1
31        -100
32           0
33           0
34        -100
35           0
36           0
37          0_


In [35]:
# This section will handle the - marks 

# Remove non-numeric characters from the Annual_Income column, excluding '.' 
train['Num_of_Loan'] = train['Num_of_Loan'].str.replace(r'[^0-9.]+', '', regex=True)

# Convert the Annual_Income column to numeric type
# Errors = 'coerce' makes sure that all non-convertible values are changed to NaN
train['Num_of_Loan'] = pd.to_numeric(train['Num_of_Loan'], errors='coerce')

In [36]:
# This section will handle the unrealistic values for the Num_of_Loan column

# Define the threshold for the number of loans
min_threshold = 0
max_threshold = 15

# Iterate over rows and update Num_of_Loan if it's outside the threshold
for index, row in train.iterrows():
    if row['Num_of_Loan'] < min_threshold or row['Num_of_Loan'] > max_threshold:
        name = row['Name']
        prev_index = index - 1
        next_index = index + 1
        
        # Find the previous valid value with the same name
        while prev_index >= 0 and train.at[prev_index, 'Name'] == name:
            if min_threshold <= train.at[prev_index, 'Num_of_Loan'] <= max_threshold:
                train.at[index, 'Num_of_Loan'] = train.at[prev_index, 'Num_of_Loan']
                break
            prev_index -= 1
        
        # Find the next valid value with the same name
        while next_index < len(train) and train.at[next_index, 'Name'] == name:
            if min_threshold <= train.at[next_index, 'Num_of_Loan'] <= max_threshold:
                train.at[index, 'Num_of_Loan'] = train.at[next_index, 'Num_of_Loan']
                break
            next_index += 1


In [37]:
print(train[['Num_of_Loan']].iloc[21:38])

    Num_of_Loan
21            3
22            3
23            3
24            1
25            1
26            1
27            1
28            1
29            1
30            1
31            1
32            0
33            0
34            0
35            0
36            0
37            0


All of the missing values in the Type_of_Loan column are because the previous column's value, Num_of_Loan, is 0. Instead of having null values in these spots, we will change them to say "None"

In [38]:
# Iterate over rows and update Type_of_Loan if Num_of_Loan is 0 and Type_of_Loan is null
for index, row in train.iterrows():
    if pd.isnull(row['Type_of_Loan']) and row['Num_of_Loan'] == 0:
        train.at[index, 'Type_of_Loan'] = 'None'

In [39]:
train["Type_of_Loan"].isnull().sum()

0

In [40]:
print(train[['Type_of_Loan']].iloc[22:38])

                               Type_of_Loan
22  Auto Loan, Auto Loan, and Not Specified
23  Auto Loan, Auto Loan, and Not Specified
24                            Not Specified
25                            Not Specified
26                            Not Specified
27                            Not Specified
28                            Not Specified
29                            Not Specified
30                            Not Specified
31                            Not Specified
32                                     None
33                                     None
34                                     None
35                                     None
36                                     None
37                                     None


The Delay_from_due_date column represents the number of days a payment is delayed from the due date. Given this definition, negative values are actually a good thing, indicating that the payment was made in advance, while positive values would mean the payment was made after the due date. There are no obvious adjustments that need to be made for this column. 

The Num_of_Delayed_Payment column has some unrealisitc values which we will clean up. It also has lots of missing values which are not as easy to fill in because we do not want to stretch our assumptions and risk filling our data with false values just for the sake of filling the data. To compromise, we will fill missing values if the value above and below it is the same, and if not we will drop this data entry. 

In [41]:
# Show the problem
print(train[['Num_of_Delayed_Payment']].iloc[138:157])

    Num_of_Delayed_Payment
138                     12
139                     12
140                    NaN
141                    12_
142                    NaN
143                     12
144                      8
145                      8
146                      7
147                      8
148                    NaN
149                      8
150                      8
151                      8
152                    NaN
153                     18
154                     19
155                     19
156                     19


In [42]:
print(train['Num_of_Delayed_Payment'].head(10))

0      7
1    NaN
2      7
3      4
4    NaN
5      4
6     8_
7      6
8      4
9      1
Name: Num_of_Delayed_Payment, dtype: object


In [43]:
# This section will handle the - marks 

# Remove non-numeric characters from the Annual_Income column, excluding '.' 
train['Num_of_Delayed_Payment'] = train['Num_of_Delayed_Payment'].str.replace(r'[^0-9.]+', '', regex=True)

# Convert the Annual_Income column to numeric type
# Errors = 'coerce' makes sure that all non-convertible values are changed to NaN
train['Num_of_Delayed_Payment'] = pd.to_numeric(train['Num_of_Delayed_Payment'], errors='coerce')


In [44]:
import pandas as pd
import numpy as np

def clean_delayed_payment_column(data):
    # Step 1: Change negative values to null
    data[data < 0] = np.nan
    
    # Step 2: Replace null values based on surrounding non-null values
    for i in range(len(data)):
        if pd.isnull(data[i]):
            left_index = i - 1
            right_index = i + 1
            left_value = None
            right_value = None
            
            # Find the nearest non-null values
            while left_index >= 0 or right_index < len(data):
                if left_index >= 0 and pd.notnull(data[left_index]):
                    left_value = data[left_index]
                    break
                left_index -= 1
                
            while right_index < len(data):
                if pd.notnull(data[right_index]):
                    right_value = data[right_index]
                    break
                right_index += 1
            
            # Replace null value if surrounding values are the same
            if left_value is not None and right_value is not None:
                if left_value == right_value:
                    data[i] = left_value
    
    return data

# Apply the cleaning function to the 'Num_of_Delayed_Payment' column
train['Num_of_Delayed_Payment'] = clean_delayed_payment_column(train['Num_of_Delayed_Payment'])


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[data < 0] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[i] = left_value


In [45]:
print(train['Num_of_Delayed_Payment'].head(10))

0    7.0
1    7.0
2    7.0
3    4.0
4    4.0
5    4.0
6    8.0
7    6.0
8    4.0
9    1.0
Name: Num_of_Delayed_Payment, dtype: float64


In [46]:
print(train[['Num_of_Delayed_Payment']].iloc[138:157])

     Num_of_Delayed_Payment
138                    12.0
139                    12.0
140                    12.0
141                    12.0
142                    12.0
143                    12.0
144                     8.0
145                     8.0
146                     7.0
147                     8.0
148                     8.0
149                     8.0
150                     8.0
151                     8.0
152                     NaN
153                    18.0
154                    19.0
155                    19.0
156                    19.0


In [47]:
train['Num_of_Delayed_Payment'].isnull().sum()

4650

There were initally about 7000 missing values in this column, so we were able to fill in roughly 1/3 of the values with this method that we otherwise would have dropped. However, we cannot fill the remaining null values accurately, so we will drop them at the end of the cleaning process. 

Now we will do the same thing for both the Changed_Credit_Limit, Num_Credit_Inquiries, and Credit_Mix columns. However, we will first have to remove dash marks from the Changed_Credit_Limit and Credit_Mix columns. Also, the Credit_Mix column will have to be treated as categorical rather than numeric. There are also some unrealistic values in the Num_Credit_Inquiries column that will need to be addressed first before filling null values. 

In [48]:
# Show the problem
print(train[['Changed_Credit_Limit']].iloc[242:259])

    Changed_Credit_Limit
242                 6.83
243                    _
244                 6.83
245                 6.83
246                 6.83
247                 6.83
248                    _
249                 5.07
250                 5.07
251                 5.07
252                 5.07
253                 6.07
254                    _
255                 5.07
256                 11.8
257                 11.8
258                 11.8


In [49]:
# This section will handle the - marks 

# Remove non-numeric characters from the Annual_Income column, excluding '.' 
train['Changed_Credit_Limit'] = train['Changed_Credit_Limit'].str.replace(r'[^0-9.]+', '', regex=True)

# Convert the Annual_Income column to numeric type
# Errors = 'coerce' makes sure that all non-convertible values are changed to NaN
train['Changed_Credit_Limit'] = pd.to_numeric(train['Changed_Credit_Limit'], errors='coerce')

In [50]:
# Apply the cleaning function to the 'Num_of_Delayed_Payment' column
train['Changed_Credit_Limit'] = clean_delayed_payment_column(train['Changed_Credit_Limit'])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[data < 0] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[i] = left_value


In [51]:
print(train[['Changed_Credit_Limit']].iloc[242:259])

     Changed_Credit_Limit
242                  6.83
243                  6.83
244                  6.83
245                  6.83
246                  6.83
247                  6.83
248                   NaN
249                  5.07
250                  5.07
251                  5.07
252                  5.07
253                  6.07
254                   NaN
255                  5.07
256                 11.80
257                 11.80
258                 11.80


In [52]:
# Show the problem
print(train[['Num_Credit_Inquiries']].iloc[186:204])

     Num_Credit_Inquiries
186                   7.0
187                   NaN
188                   7.0
189                   7.0
190                   7.0
191                  11.0
192                  12.0
193                1044.0
194                  17.0
195                  17.0
196                  17.0
197                  17.0
198                1936.0
199                   NaN
200                   3.0
201                   3.0
202                   3.0
203                   3.0


In [53]:
# If the value is greater than 30, change it to null before applying the function
train.loc[train['Num_Credit_Inquiries'] > 30, 'Num_Credit_Inquiries'] = np.nan

In [54]:
# Apply the cleaning function to the 'Num_of_Delayed_Payment' column
train['Num_Credit_Inquiries'] = clean_delayed_payment_column(train['Num_Credit_Inquiries'])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[data < 0] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[i] = left_value


In [55]:
print(train[['Num_Credit_Inquiries']].iloc[186:204])

     Num_Credit_Inquiries
186                   7.0
187                   7.0
188                   7.0
189                   7.0
190                   7.0
191                  11.0
192                  12.0
193                   NaN
194                  17.0
195                  17.0
196                  17.0
197                  17.0
198                   NaN
199                   NaN
200                   3.0
201                   3.0
202                   3.0
203                   3.0


In [56]:
# Show the problem
print(train[['Credit_Mix']].iloc[55:68])

   Credit_Mix
55       Good
56   Standard
57          _
58   Standard
59          _
60          _
61   Standard
62   Standard
63          _
64       Good
65       Good
66       Good
67       Good


In [57]:
# Need to adjust the previous method so that it does the same thing for string inputs 

def clean_credit_mix_column(data):

    # Step 1: Change "-" marks to null values
    data[data.str.startswith('_')] = np.nan
    
    # Step 2: Replace null values based on surrounding non-null values
    for i in range(1, len(data) - 1):
        if pd.isnull(data[i]):
            left_index = i - 1
            right_index = i + 1
            left_value = None
            right_value = None
            
            # Find the nearest non-null values
            while left_index >= 0 or right_index < len(data):
                if left_index >= 0 and pd.notnull(data[left_index]):
                    left_value = data[left_index]
                    break
                left_index -= 1
                
            while right_index < len(data):
                if pd.notnull(data[right_index]):
                    right_value = data[right_index]
                    break
                right_index += 1
            
            # Replace null value if surrounding values start with the same letter
            if left_value is not None and right_value is not None:
                if left_value[0] == right_value[0]:  # Check if first character matches
                    data[i] = left_value
    
    return data



# Apply the cleaning function to the 'Credit_Mix' column
train['Credit_Mix'] = clean_credit_mix_column(train['Credit_Mix'])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[data.str.startswith('_')] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[i] = left_value


In [58]:
print(train[['Credit_Mix']].iloc[55:68])

   Credit_Mix
55       Good
56   Standard
57   Standard
58   Standard
59   Standard
60   Standard
61   Standard
62   Standard
63        NaN
64       Good
65       Good
66       Good
67       Good


Outstanding_Debt has dash marks that need to be removed, but no apparent unrealistic values. 

In [59]:
# Show the problem
print(train[['Outstanding_Debt']].iloc[85:95])

   Outstanding_Debt
85          1328.93
86          1328.93
87         1328.93_
88           950.36
89           950.36
90           950.36
91           950.36
92           950.36
93           950.36
94           950.36


In [60]:
# This section will handle the - marks

train['Outstanding_Debt'] = train['Outstanding_Debt'].str.replace(r'[^0-9.]+', '', regex=True)

In [61]:
print(train[['Outstanding_Debt']].iloc[85:95])

   Outstanding_Debt
85          1328.93
86          1328.93
87          1328.93
88           950.36
89           950.36
90           950.36
91           950.36
92           950.36
93           950.36
94           950.36


Nothing appears to need to be fixed with the Credit_Utilization_Ratio column. 

We will need to use indexing to turn the Credit_History_Age column into a numerical value instead of a string. 

In [62]:
# Show the problem
print(train[['Credit_History_Age']].head(11))

       Credit_History_Age
0   22 Years and 1 Months
1                     NaN
2   22 Years and 3 Months
3   22 Years and 4 Months
4   22 Years and 5 Months
5   22 Years and 6 Months
6   22 Years and 7 Months
7                     NaN
8   26 Years and 7 Months
9   26 Years and 8 Months
10  26 Years and 9 Months


In [63]:
# Function to convert the spelled out string to numeric value
def convert_to_numeric(age_string):
    # Check if the value is not NaN
    if isinstance(age_string, str):
        years, months = age_string.split(' Years and ')[0], age_string.split(' Years and ')[1].split(' Months')[0]
        return int(years) + int(months) / 12
    else:
        return None  # or any other appropriate value for NaN

# Apply the function to the 'Credit_History_Age' column
train['Credit_History_Age'] = train['Credit_History_Age'].apply(convert_to_numeric)


In [64]:
print(train[['Credit_History_Age']].head(11))

    Credit_History_Age
0            22.083333
1                  NaN
2            22.250000
3            22.333333
4            22.416667
5            22.500000
6            22.583333
7                  NaN
8            26.583333
9            26.666667
10           26.750000


In [65]:
print(train[['Credit_History_Age']].iloc[180:190])

     Credit_History_Age
180           33.083333
181                 NaN
182                 NaN
183           33.333333
184           12.250000
185           12.333333
186           12.416667
187           12.500000
188           12.583333
189           12.666667


In [66]:
def fill_zero_credit_history_age(column):
    # FIll in missing values (if possible)
    for i in range(len(column)):
        if column[i] == 0:
            left_index = i - 1
            right_index = i + 1
            left_value = None
            right_value = None
            left_diff = None
            right_diff = None
            
            # Find the nearest non-zero values
            while left_index >= 0 or right_index < len(column):
                if left_index >= 0 and column[left_index] != 0:
                    left_value = column[left_index]
                    break
                left_index -= 1
                
            while right_index < len(column):
                if column[right_index] != 0:
                    right_value = column[right_index]
                    break
                right_index += 1
            
            # Calculate the difference between adjacent non-zero values
            if left_value is not None and right_value is not None:
                left_diff = right_value - left_value
            
            # Fill in the missing value
            if left_diff is not None:
                column[i] = left_value + (i - left_index) * left_diff / (right_index - left_index)
    
    return column


In [67]:
# Replace null values with 0.0 so that we can change the column type to use the above function
train['Credit_History_Age'].fillna(0.0, inplace=True)

# Convert the 'Credit_History_Age' column to numeric
train['Credit_History_Age'] = pd.to_numeric(train['Credit_History_Age'], errors='coerce')

# Apply the function to the 'Credit_History_Age' column
train['Credit_History_Age'] = fill_zero_credit_history_age(train['Credit_History_Age'])


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  column[i] = left_value + (i - left_index) * left_diff / (right_index - left_index)


In [68]:
print(train[['Credit_History_Age']].head(11)) # STILL NOT FIXING THIS CORRECTLY BUT VERY CLOSE, the function below should fix it

    Credit_History_Age
0            22.083333
1            22.166667
2            22.250000
3            22.333333
4            22.416667
5            22.500000
6            22.583333
7            24.583333
8            26.583333
9            26.666667
10           26.750000


In [69]:
print(train[['Credit_History_Age']].iloc[180:190]) 

     Credit_History_Age
180           33.083333
181           33.166667
182           33.250000
183           33.333333
184           12.250000
185           12.333333
186           12.416667
187           12.500000
188           12.583333
189           12.666667


In [70]:
def replace_with_null(data):
    for i in range(1, len(data) - 1):
        
        if (data[i-1] < data[i]-(1/10)) and (data[i+1] > data[i] + (1/10)):
            data[i] = np.nan
            
    return data

In [71]:
# Apply the function to the 'Credit_History_Age' column
train['Credit_History_Age'] = replace_with_null(train['Credit_History_Age'])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[i] = np.nan


In [72]:
print(train[['Credit_History_Age']].head(11))

    Credit_History_Age
0            22.083333
1            22.166667
2            22.250000
3            22.333333
4            22.416667
5            22.500000
6            22.583333
7                  NaN
8            26.583333
9            26.666667
10           26.750000


The Payment_of_Min_Amount column takes 3 values: Yes, No, or NM (assumed to be not mentioned or not applicable). No apparent adjustments need to be made to this column. 

Total_EMI_per_month stands for "Equated Monthly Installment", which refers to the monthly amount you must pay your lender to repay a loan or debt. In order to clean this column, we will first have to remove unrealistic values, and then replace null (0) values where we can. We will set the threshold to 1000, and anything greater than this threshold will become null, and we will try to replace it. (THIS WILL REQUIRE THE SAME TECHNIQUE THAT I WANT TO USE FOR NUM_OF_DELAYED_PAYMENT)

In [73]:
# Show the problem
print(train[['Monthly_Inhand_Salary', 'Total_EMI_per_month']].iloc[30:55])

    Monthly_Inhand_Salary  Total_EMI_per_month
30            2612.490833            16.415452
31            2612.490833            16.415452
32            2853.309167             0.000000
33            2853.309167             0.000000
34            2853.309167             0.000000
35            2853.309167             0.000000
36            2853.309167             0.000000
37            2853.309167             0.000000
38            2853.309167             0.000000
39            2853.309167             0.000000
40            5988.705000         15015.000000
41            5988.705000             0.000000
42            5988.705000             0.000000
43            5988.705000             0.000000
44            5988.705000             0.000000
45            5988.705000         15515.000000
46            5988.705000             0.000000
47            5988.705000             0.000000
48           11242.783330           137.644605
49           11242.783330           137.644605
50           

In [74]:
# Assuming your DataFrame is named df and the column is 'Total_EMI_per_month'
train['Total_EMI_per_month'] = train['Total_EMI_per_month'].apply(lambda x: 0 if x > 2000 else x)

In [75]:
def clean_EMI_column(row):
    if row['Total_EMI_per_month'] == 0:
        current_salary = row['Monthly_Inhand_Salary']
        # Check the previous row
        if row.name > 0 and train.at[row.name - 1, 'Monthly_Inhand_Salary'] == current_salary:
            prev_emi = train.at[row.name - 1, 'Total_EMI_per_month']
            if prev_emi != 0:
                return prev_emi
        # Check the next row
        if row.name < len(train) - 1 and train.at[row.name + 1, 'Monthly_Inhand_Salary'] == current_salary:
            next_emi = train.at[row.name + 1, 'Total_EMI_per_month']
            if next_emi != 0:
                return next_emi
    return row['Total_EMI_per_month']

# Apply the function to the 'Total_EMI_per_month' column repeatedly until no more changes occur
while True:
    updated_column = train.apply(clean_EMI_column, axis=1)
    if updated_column.equals(train['Total_EMI_per_month']):
        break
    train['Total_EMI_per_month'] = updated_column



In [76]:
print(train[['Monthly_Inhand_Salary', 'Total_EMI_per_month']].iloc[30:55])

    Monthly_Inhand_Salary  Total_EMI_per_month
30            2612.490833            16.415452
31            2612.490833            16.415452
32            2853.309167             0.000000
33            2853.309167             0.000000
34            2853.309167             0.000000
35            2853.309167             0.000000
36            2853.309167             0.000000
37            2853.309167             0.000000
38            2853.309167             0.000000
39            2853.309167             0.000000
40            5988.705000             0.000000
41            5988.705000             0.000000
42            5988.705000             0.000000
43            5988.705000             0.000000
44            5988.705000             0.000000
45            5988.705000             0.000000
46            5988.705000             0.000000
47            5988.705000             0.000000
48           11242.783330           137.644605
49           11242.783330           137.644605
50           

If Amount_invested_monthly starts with a - or is a null value, we will remove this row at the end of the cleaning where we remove all null rows because we cannot safely estimate this value. 

In [77]:
# Show the problem
print(train[['Amount_invested_monthly']].iloc[16:26])

   Amount_invested_monthly
16             168.4137027
17             232.8603838
18               __10000__
19             825.2162699
20             430.9475279
21             257.8080994
22             263.1741632
23               __10000__
24             81.22885871
25             124.8818199


There are also some values in the Payment_Behaviour column that do not fit in and are a combination of random characters as well as some null values, so we will handle this column in a similar manner to how we handled the Amount_invested_monthly column by dropping these rows at the end of the cleaning section.

In [78]:
# Show the problem
print(train['Payment_Behaviour'].head(10))

0     High_spent_Small_value_payments
1      Low_spent_Large_value_payments
2     Low_spent_Medium_value_payments
3      Low_spent_Small_value_payments
4    High_spent_Medium_value_payments
5                              !@9#%8
6      Low_spent_Small_value_payments
7    High_spent_Medium_value_payments
8      Low_spent_Small_value_payments
9     High_spent_Large_value_payments
Name: Payment_Behaviour, dtype: object


The Monthly_Balance column has some numbers that start with a dash that needs to be dropped as well as has null values that we cannot safely fill in, so we will drop rows that are missing this data at the end as well. 

In [79]:
# This section will handle the - marks 

# Remove non-numeric characters from the Annual_Income column, excluding '.' 
train['Monthly_Balance'] = train['Monthly_Balance'].str.replace(r'[^0-9.]+', '', regex=True)


In [80]:
# Show the problem (pick a different range)
print(train[['Monthly_Balance']].iloc[312:317])

    Monthly_Balance
312     518.1908835
313     217.7960961
314             NaN
315     222.9194043
316     580.3130008


The Credit_Score columns appears to be in good shape.

In [81]:
train.head(50)

Unnamed: 0,ID,Customer_ID,Month,Name,Age,SSN,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,...,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,Credit_Score
0,0x1602,CUS_0xd40,January,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,1824.843333,3,...,,809.98,26.82262,22.083333,No,49.574949,80.41529544,High_spent_Small_value_payments,312.4940887,Good
1,0x1603,CUS_0xd40,February,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,1824.843333,3,...,Good,809.98,31.94496,22.166667,No,49.574949,118.2802216,Low_spent_Large_value_payments,284.6291625,Good
2,0x1604,CUS_0xd40,March,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,1824.843333,3,...,Good,809.98,28.609352,22.25,No,49.574949,81.69952126,Low_spent_Medium_value_payments,331.2098629,Good
3,0x1605,CUS_0xd40,April,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,1824.843333,3,...,Good,809.98,31.377862,22.333333,No,49.574949,199.4580744,Low_spent_Small_value_payments,223.4513097,Good
4,0x1606,CUS_0xd40,May,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,1824.843333,3,...,Good,809.98,24.797347,22.416667,No,49.574949,41.42015309,High_spent_Medium_value_payments,341.489231,Good
5,0x1607,CUS_0xd40,June,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,1824.843333,3,...,Good,809.98,27.262259,22.5,No,49.574949,62.43017233,!@9#%8,340.4792118,Good
6,0x1608,CUS_0xd40,July,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,1824.843333,3,...,Good,809.98,22.537593,22.583333,No,49.574949,178.3440674,Low_spent_Small_value_payments,244.5653167,Good
7,0x1609,CUS_0xd40,August,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,1824.843333,3,...,Good,809.98,23.933795,,No,49.574949,24.78521651,High_spent_Medium_value_payments,358.1241676,Standard
8,0x160e,CUS_0x21b1,January,Rick Rothackerj,28,004-07-5839,Teacher,34847.84,3037.986667,2,...,Good,605.03,24.464031,26.583333,No,18.816215,104.2918252,Low_spent_Small_value_payments,470.6906269,Standard
9,0x160f,CUS_0x21b1,February,Rick Rothackerj,28,004-07-5839,Teacher,34847.84,3037.986667,2,...,Good,605.03,38.550848,26.666667,No,18.816215,40.39123783,High_spent_Large_value_payments,484.5912143,Good


Here we are dropping all the rows that have null values and resetting the index after the rows have been droppped. This will be the final training dataset. 

In [82]:
num_rows = train.shape[0]
print("Number of rows in the beginning:", num_rows)

train.dropna(subset=['Num_of_Delayed_Payment'], inplace=True)

# Change the column back to an integer type
train['Num_of_Delayed_Payment'] = train['Num_of_Delayed_Payment'].astype(int)

train['Num_of_Delayed_Payment'].isnull().sum()

train.dropna(subset=['Changed_Credit_Limit'], inplace=True)

train.dropna(subset=['Num_Credit_Inquiries'], inplace=True)

train.dropna(subset=['Credit_Mix'], inplace=True)

train.dropna(subset=['Credit_History_Age'], inplace=True)

# Drop rows where 'Total_EMI_per_month' and 'num_of_loan' are both 0
train = train.drop(train[(train['Total_EMI_per_month'] == 0) & (train['Num_of_Loan'] == 0)].index)

# Drop rows where 'Amount_invested_monthly' starts with '_' or is null
train = train[~train['Amount_invested_monthly'].astype(str).str.startswith('_') & ~train['Amount_invested_monthly'].isnull()]

# Drop rows where 'Payment_Behaviour' starts with '!' or is null
train = train[~train['Payment_Behaviour'].astype(str).str.startswith('!') & ~train['Payment_Behaviour'].isnull()]

# Drop rows with null values in 'Monthly_Balance' column
train = train.dropna(subset=['Monthly_Balance'])

# Reset index after dropping rows
train.reset_index(drop=True, inplace=True)

num_rows = train.shape[0]
print("Number of rows after dropping the nulls:", num_rows)

Number of rows in the beginning: 100000
Number of rows after dropping the nulls: 64740


In [83]:
# Display the data types of each column
print(train.dtypes)

ID                           object
Customer_ID                  object
Month                        object
Name                         object
Age                           int64
SSN                          object
Occupation                   object
Annual_Income               float64
Monthly_Inhand_Salary       float64
Num_Bank_Accounts             int64
Num_Credit_Card               int64
Interest_Rate                 int64
Num_of_Loan                   int64
Type_of_Loan                 object
Delay_from_due_date           int64
Num_of_Delayed_Payment        int32
Changed_Credit_Limit        float64
Num_Credit_Inquiries        float64
Credit_Mix                   object
Outstanding_Debt             object
Credit_Utilization_Ratio    float64
Credit_History_Age          float64
Payment_of_Min_Amount        object
Total_EMI_per_month         float64
Amount_invested_monthly      object
Payment_Behaviour            object
Monthly_Balance              object
Credit_Score                

In [84]:
# Correct the datatypes that I had to change along the way back to their original intended types 

train['Num_Credit_Inquiries'] = train['Num_Credit_Inquiries'].astype(int)
train['Outstanding_Debt'] = train['Outstanding_Debt'].astype(float)
train['Amount_invested_monthly'] = train['Amount_invested_monthly'].astype(float)
train['Monthly_Balance'] = train['Monthly_Balance'].astype(float)

In [85]:
# Save the cleaned dataset to a new csv file
# index = false is so that a new column isnt added just for the index
train.to_csv('cleaned_train.csv', index=False)