In [1]:
import pandas as pd
import matplotlib.pyplot as plt

# ---------------------------------------------------------------------------
# Load
# ---------------------------------------------------------------------------
# Read the experiment dataset (expected in the current working directory).
df = pd.read_csv('EXP4_dataset (1).csv')

# ---------------------------------------------------------------------------
# Inspect
# ---------------------------------------------------------------------------
# Overall shape, the column index and the per-column dtypes.
print("DataFrame Shape:", df.shape)
print("\nColumns:", df.columns)
print("\nData Types:\n", df.dtypes)

# The head, the tail and the numeric summary all follow the same
# "heading line, then table" layout, so they are emitted from one loop.
for heading, table in (
    ("\nFirst 5 rows:", df.head()),
    ("\nLast 5 rows:", df.tail()),
    ("\nSummary Statistics:", df.describe()),
):
    print(heading)
    print(table)

# info() writes its report straight to stdout, so it is not wrapped in print().
print("\nDataFrame Info:")
df.info()

# ---------------------------------------------------------------------------
# Column selection
# ---------------------------------------------------------------------------
# A single label yields a Series; a list of labels yields a DataFrame.
age_column = df.loc[:, 'Age']
income_credit_columns = df.loc[:, ['Income (in dollars)', 'Credit Score']]

print("\nAge column:")
print(age_column.head())

print("\nIncome and Credit Score columns:")
print(income_credit_columns.head())

# ---------------------------------------------------------------------------
# Row selection
# ---------------------------------------------------------------------------
# Positional selection of the first three rows, then a boolean-mask filter
# keeping only rows whose income is strictly above 90,000.
first_3_rows = df.head(3)
high_income_rows = df.loc[df['Income (in dollars)'] > 90000]

print("\nFirst 3 rows:")
print(first_3_rows)

print("\nHigh income rows (>$90,000):")
print(high_income_rows)

# ---------------------------------------------------------------------------
# Basic visualization
# ---------------------------------------------------------------------------
# Age against income; each point is coloured by its 'Loan Status' value.
fig, ax = plt.subplots(figsize=(10, 6))
points = ax.scatter(
    df['Age'],
    df['Income (in dollars)'],
    c=df['Loan Status'],
    cmap='viridis',
)
fig.colorbar(points, ax=ax, label='Loan Status')
ax.set_xlabel('Age')
ax.set_ylabel('Income (in dollars)')
ax.set_title('Age vs Income, colored by Loan Status')
fig.savefig('age_vs_income_scatter.png')
# The figure is written to disk only; close it so it is not rendered inline.
plt.close(fig)

print("\nScatter plot of Age vs Income, colored by Loan Status, has been saved as 'age_vs_income_scatter.png'")

DataFrame Shape: (25, 4)

Columns: Index(['Age', 'Income (in dollars)', 'Credit Score', 'Loan Status'], dtype='object')

Data Types:
 Age                    int64
Income (in dollars)    int64
Credit Score           int64
Loan Status            int64
dtype: object

First 5 rows:
   Age  Income (in dollars)  Credit Score  Loan Status
0   25                50000           650            1
1   40                80000           720            1
2   30                60000           680            0
3   50               100000           740            1
4   35                70000           700            1

Last 5 rows:
    Age  Income (in dollars)  Credit Score  Loan Status
20   34                68000           695            0
21   39                77000           715            1
22   58               115000           775            1
23   27                54000           665            0
24   41                82000           725            1

Summary Statistics:
             Age  In

In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import matplotlib.pyplot as plt

# Load the CSV file into a pandas DataFrame
df = pd.read_csv('EXP4_dataset (1).csv')

# 1. Identify missing values
print("Missing values:")
print(df.isnull().sum())

# Artificially introduce some missing values for demonstration purposes.
# The two target columns are cast to float64 first: NaN cannot be stored in an
# integer column, and relying on .loc to upcast the column implicitly is
# deprecated in recent pandas releases. The resulting dtype is the same float64
# the implicit upcast used to produce.
df = df.astype({'Income (in dollars)': 'float64', 'Credit Score': 'float64'})
df.loc[2, 'Income (in dollars)'] = np.nan
df.loc[5, 'Credit Score'] = np.nan

print("\nAfter introducing missing values:")
print(df.isnull().sum())

# 2. Handle missing values
# Fill missing values in 'Income' with the column mean (mean() skips NaN).
# The result is assigned back instead of calling fillna(..., inplace=True) on
# the column selection: that chained in-place form acts on a temporary object
# and does not update `df` when pandas Copy-on-Write is active.
df['Income (in dollars)'] = df['Income (in dollars)'].fillna(
    df['Income (in dollars)'].mean()
)

# Interpolate missing values in 'Credit Score' (linear between neighbours)
df['Credit Score'] = df['Credit Score'].interpolate()

print("\nAfter handling missing values:")
print(df.isnull().sum())

# 3. Apply scaling techniques
# Min-Max scaling for 'Age' (rescaled into the [0, 1] range)
min_max_scaler = MinMaxScaler()
df['Age_MinMax_Scaled'] = min_max_scaler.fit_transform(df[['Age']])

# Z-score scaling for 'Income' (zero mean, unit variance)
standard_scaler = StandardScaler()
df['Income_Z_Scaled'] = standard_scaler.fit_transform(df[['Income (in dollars)']])

# 4. Create dummy variables for categorical columns
# In this dataset, 'Loan Status' can be considered categorical.
# The dtype is pinned to uint8 (the historical default) so the indicator
# columns are numeric 0/1 on every pandas version; newer versions default to
# bool, which describe() below would silently leave out.
df_with_dummies = pd.get_dummies(
    df, columns=['Loan Status'], prefix='Loan', dtype=np.uint8
)

# Display the first few rows of the processed DataFrame
print("\nFirst few rows of the processed DataFrame:")
print(df_with_dummies.head())

# Visualize the effect of scaling: original values on the left, scaled on the right
fig, (ax_original, ax_scaled) = plt.subplots(1, 2, figsize=(12, 6))

ax_original.scatter(df['Age'], df['Income (in dollars)'])
ax_original.set_xlabel('Age')
ax_original.set_ylabel('Income')
ax_original.set_title('Original Data')

ax_scaled.scatter(df['Age_MinMax_Scaled'], df['Income_Z_Scaled'])
ax_scaled.set_xlabel('Age (Min-Max Scaled)')
ax_scaled.set_ylabel('Income (Z-Score Scaled)')
ax_scaled.set_title('Scaled Data')

fig.tight_layout()
fig.savefig('scaling_comparison.png')
# Saved to disk only; close the figure so it is not rendered inline.
plt.close(fig)

print("\nScaling comparison plot has been saved as 'scaling_comparison.png'")

# Summary statistics of the processed DataFrame
print("\nSummary statistics of the processed DataFrame:")
print(df_with_dummies.describe())

# Save the processed DataFrame to a new CSV file
df_with_dummies.to_csv('processed_dataset.csv', index=False)
print("\nProcessed dataset has been saved as 'processed_dataset.csv'")
# Last edited just now




Missing values:
Age                    0
Income (in dollars)    0
Credit Score           0
Loan Status            0
dtype: int64

After introducing missing values:
Age                    0
Income (in dollars)    1
Credit Score           1
Loan Status            0
dtype: int64

After handling missing values:
Age                    0
Income (in dollars)    0
Credit Score           0
Loan Status            0
dtype: int64

First few rows of the processed DataFrame:
   Age  Income (in dollars)  Credit Score  Age_MinMax_Scaled  Income_Z_Scaled  \
0   25              50000.0         650.0           0.000000        -1.540716   
1   40              80000.0         720.0           0.428571         0.032781   
2   30              79375.0         680.0           0.142857         0.000000   
3   50             100000.0         740.0           0.714286         1.081780   
4   35              70000.0         700.0           0.285714        -0.491718   

   Loan_0  Loan_1  
0       0       1  
1      

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the CSV file into a pandas DataFrame
df = pd.read_csv('EXP4_dataset (1).csv')

# 1. Aggregation and Grouping
print("1. Aggregation and Grouping")

# Calculate summary statistics using groupby: one row per 'Loan Status' value,
# with a two-level column index of (source column, statistic).
loan_status_summary = df.groupby('Loan Status').agg({
    'Age': ['mean', 'median', 'count'],
    'Income (in dollars)': ['mean', 'median', 'min', 'max'],
    'Credit Score': ['mean', 'median', 'min', 'max']
})

print("\nSummary statistics by Loan Status:")
print(loan_status_summary)

# Calculate average income by age group.
# pd.cut uses right-closed intervals, e.g. (30, 40] is labelled '31-40'.
df['Age Group'] = pd.cut(df['Age'], bins=[0, 30, 40, 50, 60, np.inf], labels=['18-30', '31-40', '41-50', '51-60', '60+'])
# 'Age Group' is categorical; observed=False is passed explicitly so that
# empty categories are still listed (with NaN) instead of depending on the
# deprecated implicit default.
avg_income_by_age = (
    df.groupby('Age Group', observed=False)['Income (in dollars)']
    .mean()
    .sort_values(ascending=False)
)

print("\nAverage Income by Age Group:")
print(avg_income_by_age)

# Visualize average income by age group
fig, ax = plt.subplots(figsize=(10, 6))
avg_income_by_age.plot(kind='bar', ax=ax)
ax.set_title('Average Income by Age Group')
ax.set_xlabel('Age Group')
ax.set_ylabel('Average Income (in dollars)')
fig.tight_layout()
fig.savefig('avg_income_by_age_group.png')
plt.close(fig)

print("\nBar plot of Average Income by Age Group has been saved as 'avg_income_by_age_group.png'")

# 2. Pivot Tables
print("\n2. Pivot Tables")

# Create a pivot table of average credit score by age group and loan status
pivot_table = pd.pivot_table(
    df,
    values='Credit Score',
    index='Age Group',
    columns='Loan Status',
    aggfunc='mean',
    observed=False,  # explicit for the categorical index; same as the old default
)
print("\nPivot Table - Average Credit Score by Age Group and Loan Status:")
print(pivot_table)

# Visualize the pivot table.
# DataFrame.plot opens a brand-new figure unless it is handed an Axes, which
# previously left the figure created for this plot empty and unclosed while the
# saved image ignored the requested size. Passing ax= draws on the intended
# 10x6 figure.
fig, ax = plt.subplots(figsize=(10, 6))
pivot_table.plot(kind='bar', stacked=True, ax=ax)
ax.set_title('Average Credit Score by Age Group and Loan Status')
ax.set_xlabel('Age Group')
ax.set_ylabel('Average Credit Score')
ax.legend(title='Loan Status', loc='center left', bbox_to_anchor=(1, 0.5))
fig.tight_layout()
fig.savefig('credit_score_by_age_loan_status.png')
plt.close(fig)

print("\nStacked bar plot of Credit Score by Age Group and Loan Status has been saved as 'credit_score_by_age_loan_status.png'")

# 3. Combining DataFrames
print("\n3. Combining DataFrames")

# Split the original DataFrame into two for demonstration purposes
df1 = df.iloc[:12].copy()  # First 12 rows
df2 = df.iloc[12:].copy()  # Remaining rows

# Add a new column to df2 to demonstrate merging
# (lower credit score -> higher risk label; intervals are right-closed)
df2['Risk Level'] = pd.cut(df2['Credit Score'], bins=[0, 650, 700, 750, np.inf], labels=['High', 'Medium', 'Low', 'Very Low'])

# Concatenate DataFrames: rows are stacked; 'Risk Level' exists only in df2,
# so it is NaN for the rows that come from df1.
df_concat = pd.concat([df1, df2], axis=0, ignore_index=True)
print("\nShape after concatenation:", df_concat.shape)

# Merge DataFrames
# NOTE(review): 'Age' is used as the key purely for demonstration. It is not
# guaranteed to be unique, so a df1 row is repeated once per matching df2 row.
df_merge = pd.merge(df1, df2[['Age', 'Risk Level']], on='Age', how='left')
print("\nShape after merging:", df_merge.shape)

# Join DataFrames: outer join on the 'Age' index keeps ages from either half
df_join = df1.set_index('Age').join(df2.set_index('Age')[['Risk Level']], how='outer')
print("\nShape after joining:", df_join.shape)

# Display the first few rows of the joined DataFrame
print("\nFirst few rows of the joined DataFrame:")
print(df_join.head())

# Save the final DataFrame to a new CSV file ('Age' is written as the index column)
df_join.to_csv('final_processed_dataset.csv')
print("\nFinal processed dataset has been saved as 'final_processed_dataset.csv'")
 # Last edited just now




1. Aggregation and Grouping

Summary statistics by Loan Status:
                   Age              Income (in dollars)                  \
                  mean median count                mean   median    min   
Loan Status                                                               
0            29.750000   29.5     8        59375.000000  59000.0  52000   
1            44.411765   43.0    17        87647.058824  87000.0  50000   

                    Credit Score                   
                max         mean median  min  max  
Loan Status                                        
0             68000   677.500000  677.5  660  695  
1            120000   733.823529  735.0  650  780  

Average Income by Age Group:
Age Group
51-60    110000.000000
41-50     89428.571429
31-40     71250.000000
18-30     54833.333333
60+                NaN
Name: Income (in dollars), dtype: float64

Bar plot of Average Income by Age Group has been saved as 'avg_income_by_age_group.png'

2. Pivot Tabl

<Figure size 1000x600 with 0 Axes>