# predicting Income for subsidy Using Random Forest algorithm

In [1]:
import pandas as pd
import numpy as np 
import seaborn as sns 
from sklearn.model_selection import train_test_split 
from sklearn.ensemble import RandomForestClassifier 
from sklearn.metrics import accuracy_score, confusion_matrix

In [2]:
data = pd.read_csv('income(1) (2).csv', na_values=[" ?"]) 

In [3]:
# Data pre-processing
# =============================================================================

data.isnull().sum()

missing = data[data.isnull().any(axis=1)]
# axis=1 => to consider at least one column value is missing in a row

In [4]:
data2 = data.dropna(axis=0)

In [5]:

# Reindexing the salary status names to 0,1
data2['SalStat'] = data2['SalStat'].map({' less than or equal to 50,000': 0, ' greater than 50,000': 1})
print(data2['SalStat'])


0        0
1        0
2        1
3        0
4        0
        ..
31973    0
31974    0
31975    0
31976    0
31977    0
Name: SalStat, Length: 30162, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data2['SalStat'] = data2['SalStat'].map({' less than or equal to 50,000': 0, ' greater than 50,000': 1})


In [6]:
new_data = pd.get_dummies(data2, drop_first=True)

In [7]:
# Storing the column names 
columns_list = list(new_data.columns)
print(columns_list)

['age', 'capitalgain', 'capitalloss', 'hoursperweek', 'SalStat', 'JobType_ Local-gov', 'JobType_ Private', 'JobType_ Self-emp-inc', 'JobType_ Self-emp-not-inc', 'JobType_ State-gov', 'JobType_ Without-pay', 'EdType_ 11th', 'EdType_ 12th', 'EdType_ 1st-4th', 'EdType_ 5th-6th', 'EdType_ 7th-8th', 'EdType_ 9th', 'EdType_ Assoc-acdm', 'EdType_ Assoc-voc', 'EdType_ Bachelors', 'EdType_ Doctorate', 'EdType_ HS-grad', 'EdType_ Masters', 'EdType_ Preschool', 'EdType_ Prof-school', 'EdType_ Some-college', 'maritalstatus_ Married-AF-spouse', 'maritalstatus_ Married-civ-spouse', 'maritalstatus_ Married-spouse-absent', 'maritalstatus_ Never-married', 'maritalstatus_ Separated', 'maritalstatus_ Widowed', 'occupation_ Armed-Forces', 'occupation_ Craft-repair', 'occupation_ Exec-managerial', 'occupation_ Farming-fishing', 'occupation_ Handlers-cleaners', 'occupation_ Machine-op-inspct', 'occupation_ Other-service', 'occupation_ Priv-house-serv', 'occupation_ Prof-specialty', 'occupation_ Protective-s

In [8]:
# Separating the input names from data
features = list(set(columns_list) - set(['SalStat']))
print(features)

['nativecountry_ Japan', 'EdType_ Some-college', 'maritalstatus_ Married-civ-spouse', 'occupation_ Prof-specialty', 'maritalstatus_ Separated', 'maritalstatus_ Married-spouse-absent', 'nativecountry_ Germany', 'EdType_ Assoc-voc', 'nativecountry_ Holand-Netherlands', 'JobType_ Private', 'EdType_ Doctorate', 'occupation_ Armed-Forces', 'EdType_ 11th', 'race_ White', 'EdType_ Masters', 'relationship_ Wife', 'age', 'nativecountry_ Mexico', 'nativecountry_ Guatemala', 'nativecountry_ Italy', 'JobType_ Without-pay', 'nativecountry_ El-Salvador', 'capitalloss', 'nativecountry_ United-States', 'nativecountry_ Greece', 'relationship_ Other-relative', 'nativecountry_ Dominican-Republic', 'nativecountry_ China', 'nativecountry_ Thailand', 'gender_ Male', 'race_ Black', 'nativecountry_ Canada', 'occupation_ Sales', 'nativecountry_ Haiti', 'nativecountry_ Vietnam', 'nativecountry_ Yugoslavia', 'EdType_ Bachelors', 'occupation_ Machine-op-inspct', 'nativecountry_ Trinadad&Tobago', 'capitalgain', 'n

In [9]:
# Storing the output values in y
y = new_data['SalStat'].values
print(y)


[0 0 1 ... 0 0 0]


In [10]:
# Storing the values from input features
x = new_data[features].values
print(x)


[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 1 1 ... 0 0 0]
 ...
 [0 1 1 ... 0 0 0]
 [0 1 1 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Make sure x and y are correctly defined with data

# Initialize a Random Forest model
model = RandomForestClassifier()

# Initialize an empty list to store accuracy scores
accuracy_scores = []

# Perform Random Forest classification for 15 iterations
for i in range(15):
    # Split the data into training and testing sets (adjust sizes as needed)
    train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.3, random_state=i)
    
    # Train the model on the training data
    model.fit(train_x, train_y)
    
    # Make predictions on the test data
    predictions = model.predict(test_x)
    
    # Calculate accuracy and append to the list
    accuracy = accuracy_score(test_y, predictions)
    accuracy_scores.append(accuracy)

# Print the accuracy scores for each iteration
for i, score in enumerate(accuracy_scores):
    print(f'Iteration {i + 1} - Accuracy: {score:.2%}')


Iteration 1 - Accuracy: 84.25%
Iteration 2 - Accuracy: 84.69%
Iteration 3 - Accuracy: 84.61%
Iteration 4 - Accuracy: 84.27%
Iteration 5 - Accuracy: 84.03%
Iteration 6 - Accuracy: 84.44%
Iteration 7 - Accuracy: 84.04%
Iteration 8 - Accuracy: 84.52%
Iteration 9 - Accuracy: 83.49%
Iteration 10 - Accuracy: 84.50%
Iteration 11 - Accuracy: 84.72%
Iteration 12 - Accuracy: 84.10%
Iteration 13 - Accuracy: 84.11%
Iteration 14 - Accuracy: 84.36%
Iteration 15 - Accuracy: 84.50%


In [12]:
from sklearn.ensemble import RandomForestClassifier  # Import Random Forest Classifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Make sure x and y are correctly defined with data

# Initialize empty lists to store accuracy scores and losses
accuracy_scores = []
losses = []  # Define a list to store your custom loss values

# Perform Random Forest classification for 20 iterations
for i in range(20):
    # Split the data into training and testing sets (adjust sizes as needed)
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=i)

    # Initialize a Random Forest classifier with a specified number of estimators
    n_estimators = 100  # You can adjust the number of estimators as needed
    rf_model = RandomForestClassifier(n_estimators=n_estimators)

    # Train the model on the training data
    rf_model.fit(X_train, y_train)

    # Make predictions on the test data
    predictions = rf_model.predict(X_test)

    # Calculate accuracy and append to the list
    accuracy = accuracy_score(y_test, predictions)
    accuracy_scores.append(accuracy)

    # You need to define your custom loss function and calculate loss
    # Here's an example using mean squared error as a loss function
    # Replace this with your specific loss function
    loss = ((y_test - predictions) ** 2).mean()
    losses.append(loss)

# Print the accuracy scores and losses for each iteration
for i, (score, loss) in enumerate(zip(accuracy_scores, losses)):
    print(f'Iteration {i + 1} - Accuracy: {score:.2%} - Loss: {loss:.4f}')


Iteration 1 - Accuracy: 84.19% - Loss: 0.1581
Iteration 2 - Accuracy: 84.61% - Loss: 0.1539
Iteration 3 - Accuracy: 84.63% - Loss: 0.1537
Iteration 4 - Accuracy: 84.12% - Loss: 0.1588
Iteration 5 - Accuracy: 84.11% - Loss: 0.1589
Iteration 6 - Accuracy: 84.24% - Loss: 0.1576
Iteration 7 - Accuracy: 84.11% - Loss: 0.1589
Iteration 8 - Accuracy: 84.46% - Loss: 0.1554
Iteration 9 - Accuracy: 83.25% - Loss: 0.1675
Iteration 10 - Accuracy: 84.39% - Loss: 0.1561
Iteration 11 - Accuracy: 84.55% - Loss: 0.1545
Iteration 12 - Accuracy: 84.12% - Loss: 0.1588
Iteration 13 - Accuracy: 83.98% - Loss: 0.1602
Iteration 14 - Accuracy: 84.21% - Loss: 0.1579
Iteration 15 - Accuracy: 84.43% - Loss: 0.1557
Iteration 16 - Accuracy: 84.63% - Loss: 0.1537
Iteration 17 - Accuracy: 84.51% - Loss: 0.1549
Iteration 18 - Accuracy: 84.31% - Loss: 0.1569
Iteration 19 - Accuracy: 84.41% - Loss: 0.1559
Iteration 20 - Accuracy: 84.34% - Loss: 0.1566


In [13]:
# Splitting the data into train and test
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.3, random_state=0)


In [14]:
# Random Forest
# =============================================================================

# Storing the Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=0)  

# Fitting the values for X and Y
rf_classifier.fit(train_x, train_y) 
1

1

In [14]:
# Predicting the test values with the model
prediction = rf_classifier.predict(test_x)

# Performance metric check
confusion_matrix_rf = confusion_matrix(test_y, prediction)
print(confusion_matrix_rf)

[[6231  592]
 [ 837 1389]]


In [15]:
# Calculating the accuracy
accuracy_score_rf = accuracy_score(test_y, prediction)
print(accuracy_score_rf)

0.84208199801083


In [16]:
print('Misclassified samples: %d' % (test_y != prediction).sum())

Misclassified samples: 1429


In [17]:
# Storing the column names 
columns_list = list(new_data.columns)
print(columns_list)

# Separating the input names from data
features = list(set(columns_list) - set(['SalStat']))
print(features)

# Storing the output values in y
y = new_data['SalStat'].values
print(y)

# Storing the values from input features
x = new_data[features].values
print(x)

# Splitting the data into train and test
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.2, random_state=0)

# =============================================================================
# Random Forest
# =============================================================================



['age', 'capitalgain', 'capitalloss', 'hoursperweek', 'SalStat', 'JobType_ Local-gov', 'JobType_ Private', 'JobType_ Self-emp-inc', 'JobType_ Self-emp-not-inc', 'JobType_ State-gov', 'JobType_ Without-pay', 'EdType_ 11th', 'EdType_ 12th', 'EdType_ 1st-4th', 'EdType_ 5th-6th', 'EdType_ 7th-8th', 'EdType_ 9th', 'EdType_ Assoc-acdm', 'EdType_ Assoc-voc', 'EdType_ Bachelors', 'EdType_ Doctorate', 'EdType_ HS-grad', 'EdType_ Masters', 'EdType_ Preschool', 'EdType_ Prof-school', 'EdType_ Some-college', 'maritalstatus_ Married-AF-spouse', 'maritalstatus_ Married-civ-spouse', 'maritalstatus_ Married-spouse-absent', 'maritalstatus_ Never-married', 'maritalstatus_ Separated', 'maritalstatus_ Widowed', 'occupation_ Armed-Forces', 'occupation_ Craft-repair', 'occupation_ Exec-managerial', 'occupation_ Farming-fishing', 'occupation_ Handlers-cleaners', 'occupation_ Machine-op-inspct', 'occupation_ Other-service', 'occupation_ Priv-house-serv', 'occupation_ Prof-specialty', 'occupation_ Protective-s

In [18]:
# Storing the Random Forest Classifier with hyperparameter tuning
rf_classifier = RandomForestClassifier(n_estimators=300, max_depth=20, min_samples_split=2, min_samples_leaf=1, random_state=0)  

# Fitting the values for X and Y
rf_classifier.fit(train_x, train_y) 



RandomForestClassifier(max_depth=20, n_estimators=300, random_state=0)

In [19]:
# Predicting the test values with the model
prediction = rf_classifier.predict(test_x)

# Performance metric check
confusion_matrix_rf = confusion_matrix(test_y, prediction)
print(confusion_matrix_rf)

# Calculating the accuracy
accuracy_score_rf = accuracy_score(test_y, prediction)
print("random forest algorithm")
print("Accuracy Score:", accuracy_score_rf)

print('Misclassified samples: %d' % (test_y != prediction).sum())

[[4282  248]
 [ 636  867]]
random forest algorithm
Accuracy Score: 0.8534725675451682
Misclassified samples: 884


In [20]:
# Storing the Random Forest Classifier with hyperparameter tuning
rf_classifier = RandomForestClassifier(n_estimators=500, max_depth=20, min_samples_split=5, min_samples_leaf=2, random_state=0)  

# Fitting the values for X and Y
rf_classifier.fit(train_x, train_y) 

# Predicting the test values with the model
prediction = rf_classifier.predict(test_x)

# Performance metric check
confusion_matrix_rf = confusion_matrix(test_y, prediction)
print(confusion_matrix_rf)

# Calculating the accuracy
accuracy_score_rf = accuracy_score(test_y, prediction)
print("Accuracy Score:", accuracy_score_rf)

print('Misclassified samples: %d' % (test_y != prediction).sum())

[[4285  245]
 [ 641  862]]
Accuracy Score: 0.8531410575169899
Misclassified samples: 886
