In [1]:
# importing the dataset
import pandas
import numpy
from sklearn import preprocessing
import flask
from flask import Flask

df = pandas.read_csv('adult.csv')
df.head()


Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


### EDA

In [2]:
df.shape

(32561, 15)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age              32561 non-null  int64 
 1   workclass        32561 non-null  object
 2   fnlwgt           32561 non-null  int64 
 3   education        32561 non-null  object
 4   educational-num  32561 non-null  int64 
 5   marital-status   32561 non-null  object
 6   occupation       32561 non-null  object
 7   relationship     32561 non-null  object
 8   race             32561 non-null  object
 9   gender           32561 non-null  object
 10  capital-gain     32561 non-null  int64 
 11  capital-loss     32561 non-null  int64 
 12  hours-per-week   32561 non-null  int64 
 13  native-country   32561 non-null  object
 14  income           32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


### Preprocessing the dataset 
We replaced the missing values with the mode value in that column

In [2]:
df = df.drop(['fnlwgt', 'educational-num'], axis = 1)

col_names = df.columns

for c in col_names:
    df = df.replace("?", numpy.NaN)
df = df.apply(lambda x:x.fillna(x.value_counts().index[0]))


Discretization – It is a common way to make categorical data more tidy and meaningful. We have applied discretization on column marital_status where they are narrowed down to only to values married or not married. Later, we will apply label encoder in the remaining data columns. Also, there are two redundant columns {‘education’, ‘educational-num’}, therefore, we have removed one of them.

In [4]:
df.replace(['Divorced', 'Married-AF-spouse','Married-civ-spouse', 'Married-spouse-absent','Never-married', 'Separated', 'Widowed'],
           ['divorced', 'married', 'married', 'married','not married', 'not married', 'not married'], inplace = True)

category_col =['workclass', 'race', 'education', 'marital-status', 'occupation',
            'relationship', 'gender', 'native-country', 'income']
labelEncoder = preprocessing.LabelEncoder()

mapping_dict ={}
for col in category_col:
   df[col] = labelEncoder.fit_transform(df[col])

   le_name_mapping = dict(zip(labelEncoder.classes_,
   labelEncoder.transform(labelEncoder.classes_)))

   mapping_dict[col]= le_name_mapping
print(mapping_dict)


{'workclass': {' ?': 0, ' Federal-gov': 1, ' Local-gov': 2, ' Never-worked': 3, ' Private': 4, ' Self-emp-inc': 5, ' Self-emp-not-inc': 6, ' State-gov': 7, ' Without-pay': 8}, 'race': {' Amer-Indian-Eskimo': 0, ' Asian-Pac-Islander': 1, ' Black': 2, ' Other': 3, ' White': 4}, 'education': {' 10th': 0, ' 11th': 1, ' 12th': 2, ' 1st-4th': 3, ' 5th-6th': 4, ' 7th-8th': 5, ' 9th': 6, ' Assoc-acdm': 7, ' Assoc-voc': 8, ' Bachelors': 9, ' Doctorate': 10, ' HS-grad': 11, ' Masters': 12, ' Preschool': 13, ' Prof-school': 14, ' Some-college': 15}, 'marital-status': {' Divorced': 0, ' Married-AF-spouse': 1, ' Married-civ-spouse': 2, ' Married-spouse-absent': 3, ' Never-married': 4, ' Separated': 5, ' Widowed': 6}, 'occupation': {' ?': 0, ' Adm-clerical': 1, ' Armed-Forces': 2, ' Craft-repair': 3, ' Exec-managerial': 4, ' Farming-fishing': 5, ' Handlers-cleaners': 6, ' Machine-op-inspct': 7, ' Other-service': 8, ' Priv-house-serv': 9, ' Prof-specialty': 10, ' Protective-serv': 11, ' Sales': 12, '

## Fitting the model –
After pre-processing the data, split the dataset into two halves, one for training and one for testing. This is achieved using train_test_split() function of sklearn.

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

X = df.values[:, 0:12]
Y = df.values[:, 12]


In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.3, random_state = 100)

dt_clf_gini = DecisionTreeClassifier(criterion = "gini",
                                     random_state = 100,
                                     max_depth = 5,
                                     min_samples_leaf = 5)

dt_clf_gini.fit(X_train, y_train)
y_pred_gini = dt_clf_gini.predict(X_test)

print ("Desicion Tree using Gini Index\nAccuracy is ",
accuracy_score(y_test, y_pred_gini)*100 )


Desicion Tree using Gini Index
Accuracy is  46.831814924762


##### <html>
<body>
	<h3>Income Prediction Form</h3>

<div>
<form action="/result" method="POST">
	<label for="age">Age</label>
	<input type="text" id="age" name="age">
	<br>
	<label for="w_class">Working Class</label>
	<select id="w_class" name="w_class">
	<option value="0">Federal-gov</option>
	<option value="1">Local-gov</option>
	<option value="2">Never-worked</option>
	<option value="3">Private</option>
	<option value="4">Self-emp-inc</option>
	<option value="5">Self-emp-not-inc</option>
	<option value="6">State-gov</option>
	<option value="7">Without-pay</option>
	</select>
	<br>
	<label for="edu">Education</label>
	<select id="edu" name="edu">
	<option value="0">10th</option>
	<option value="1">11th</option>
	<option value="2">12th</option>
	<option value="3">1st-4th</option>
	<option value="4">5th-6th</option>
	<option value="5">7th-8th</option>
	<option value="6">9th</option>
	<option value="7">Assoc-acdm</option>
	<option value="8">Assoc-voc</option>
	<option value="9">Bachelors</option>
	<option value="10">Doctorate</option>
	<option value="11">HS-grad</option>
	<option value="12">Masters</option>
	<option value="13">Preschool</option>
	<option value="14">Prof-school</option>
	<option value="15">16 - Some-college</option>
	</select>
	<br>
	<label for="martial_stat">Marital Status</label>
	<select id="martial_stat" name="martial_stat">
	<option value="0">divorced</option>
	<option value="1">married</option>
	<option value="2">not married</option>
	</select>
	<br>
	<label for="occup">Occupation</label>
	<select id="occup" name="occup">
	<option value="0">Adm-clerical</option>
	<option value="1">Armed-Forces</option>
	<option value="2">Craft-repair</option>
	<option value="3">Exec-managerial</option>
	<option value="4">Farming-fishing</option>
	<option value="5">Handlers-cleaners</option>
	<option value="6">Machine-op-inspct</option>
	<option value="7">Other-service</option>
	<option value="8">Priv-house-serv</option>
	<option value="9">Prof-specialty</option>
	<option value="10">Protective-serv</option>
	<option value="11">Sales</option>
	<option value="12">Tech-support</option>
	<option value="13">Transport-moving</option>
	</select>
	<br>
	<label for="relation">Relationship</label>
	<select id="relation" name="relation">
	<option value="0">Husband</option>
	<option value="1">Not-in-family</option>
	<option value="2">Other-relative</option>
	<option value="3">Own-child</option>
	<option value="4">Unmarried</option>
	<option value="5">Wife</option>
	</select>
	<br>
	<label for="race">Race</label>
	<select id="race" name="race">
	<option value="0">Amer Indian Eskimo</option>
	<option value="1">Asian Pac Islander</option>
	<option value="2">Black</option>
	<option value="3">Other</option>
	<option value="4">White</option>
	</select>
	<br>
	<label for="gender">Gender</label>
	<select id="gender" name="gender">
	<option value="0">Female</option>
	<option value="1">Male</option>
	</select>
	<br>
	<label for="c_gain">Capital Gain </label>
	<input type="text" id="c_gain" name="c_gain">btw:[0-99999]
	<br>
	<label for="c_loss">Capital Loss </label>
	<input type="text" id="c_loss" name="c_loss">btw:[0-4356]
	<br>
	<label for="hours_per_week">Hours per Week </label>
	<input type="text" id="hours_per_week" name="hours_per_week">btw:[1-99]
	<br>
	<label for="native-country">Native Country</label>
	<select id="native-country" name="native-country">
	<option value="0">Cambodia</option>
	<option value="1">Canada</option>
	<option value="2">China</option>
	<option value="3">Columbia</option>
	<option value="4">Cuba</option>
	<option value="5">Dominican Republic</option>
	<option value="6">Ecuador</option>
	<option value="7">El Salvadorr</option>
	<option value="8">England</option>
	<option value="9">France</option>
	<option value="10">Germany</option>
	<option value="11">Greece</option>
	<option value="12">Guatemala</option>
	<option value="13">Haiti</option>
	<option value="14">Netherlands</option>
	<option value="15">Honduras</option>
	<option value="16">HongKong</option>
	<option value="17">Hungary</option>
	<option value="18">India</option>
	<option value="19">Iran</option>
	<option value="20">Ireland</option>
	<option value="21">Italy</option>
	<option value="22">Jamaica</option>
	<option value="23">Japan</option>
	<option value="24">Laos</option>
	<option value="25">Mexico</option>
	<option value="26">Nicaragua</option>
	<option value="27">Outlying-US(Guam-USVI-etc)</option>
	<option value="28">Peru</option>
	<option value="29">Philippines</option>
	<option value="30">Poland</option>
	<option value="11">Portugal</option>
	<option value="32">Puerto-Rico</option>
	<option value="33">Scotland</option>
	<option value="34">South</option>
	<option value="35">Taiwan</option>
	<option value="36">Thailand</option>
	<option value="37">Trinadad&Tobago</option>
	<option value="38">United States</option>
	<option value="39">Vietnam</option>
	<option value="40">Yugoslavia</option>
	</select>
	<br>
	<input type="submit" value="Submit">
</form>
</div>
</body>
</html>


#### Below is simple code HTML form
https://www.tutorialrepublic.com/html-tutorial/html-forms.php

## Simple HTML form
<!DOCTYPE html>
<html lang="en">
<head>
    <title>Simple HTML Form</title>
</head>
<body>
    <form action="/examples/actions/confirmation.php" method="post">
        <label>Username: <input type="text" name="username"></label>
        <label>Password: <input type="password" name="userpass"></label>
        <input type="submit" value="Submit">
    </form>
</body>
</html>

In [7]:
from flask import Flask
#from flask.ext.sqlalchemy import SQLAlchemy
import os

app = Flask(__name__)
@app.route('/')
render_template('index.html')

SyntaxError: invalid syntax (<ipython-input-7-54dfa3e03fd8>, line 7)

In [16]:
# prediction function
def ValuePredictor(to_predict_list):
    to_predict = np.array(to_predict_list).reshape(1, 12)
    loaded_model = pickle.load(open("model.pkl", "rb"))
    result = loaded_model.predict(to_predict)
    return result[0]

@app.route('/result', methods = ['POST'])
def result():
    if request.method == 'POST':
        to_predict_list = request.form.to_dict()
        to_predict_list = list(to_predict_list.values())
        to_predict_list = list(map(int, to_predict_list))
        result = ValuePredictor(to_predict_list)
    if int(result)== 1:
        prediction ='Income more than 50K'
    else:
        prediction ='Income less that 50K'
    return render_template("result.html", prediction = prediction)


NameError: name 'app' is not defined