In [1]:
from __future__ import print_function, division

In [21]:
import pandas as pd

from sklearn import preprocessing
from sklearn import linear_model
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.pipeline import Pipeline
from sklearn import decomposition

from sklearn.model_selection import train_test_split, cross_val_score

DATA_PATH = './data/'
# Pipelining: This chains a PCA and logistic regression, and uses the UCI
# Census Adult dataset.

df = pd.read_csv(DATA_PATH + 'adult.data.csv')
new_df = pd.DataFrame()
df.columns = ['age', 'workclass', 'fnlwgt', 'education', 'education_num',
              'marital_status', 'occupation', 'relationship', 'race', 'sex',
              'capital_gain', 'capital_loss', 'hours_per_week',
              'native_country', 'income_level']

le = preprocessing.LabelEncoder()

# Assigning 0.0 to represent incomes <=50K, and 1.0 to represent incomes >50K
df['income_level'] = df['income_level'].str.strip()
df['income_level'] = df['income_level'].replace(['<=50K'], [0.0])
df['income_level'] = df['income_level'].replace(['>50K'], [1.0])

# calling labelEncoder on any columns that are object types
for coltype, colname in zip(df.dtypes, df.columns):
    if coltype == 'object':
        le.fit(df[colname])
        transformed_vals = le.transform(df[colname])
        new_df[colname + "_index"] = transformed_vals
    else:
        new_df[colname] = df[colname]

# Creating the pipeline
pca = decomposition.PCA()
lr = linear_model.LogisticRegression(max_iter=250)
pipe = Pipeline(steps=[('pca', pca), ('logistic', lr)])

In [22]:
x_train, x_test, y_train, y_test = train_test_split(
    new_df, new_df['income_level'], test_size=0.3, random_state=0)

In [14]:
partial_training = x_train[x_train.columns[:-1]]
partial_testing = x_test[x_test.columns[:-1]]

In [15]:
partial_training.shape

(22792, 14)

In [16]:
y_train.shape

(22792,)

In [20]:
partial_training[:100]

Unnamed: 0,age,workclass_index,fnlwgt,education_index,education_num,marital_status_index,occupation_index,relationship_index,race_index,sex_index,capital_gain,capital_loss,hours_per_week,native_country_index
20721,32,5,343872,15,10,2,14,0,2,1,0,0,35,14
32097,45,4,170871,11,9,2,3,0,4,1,7298,0,60,39
25205,47,7,108890,11,9,0,1,4,4,0,1831,0,38,39
23491,37,4,61778,9,13,4,5,1,4,1,0,0,30,39
12367,24,4,108495,15,10,4,1,3,4,1,0,0,40,19
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26276,44,6,127482,11,9,2,3,0,4,1,7298,0,50,9
8017,30,4,177522,11,9,2,1,3,4,0,0,0,20,39
1640,42,4,285066,9,13,2,10,0,4,1,0,0,45,39
17495,43,4,117728,11,9,2,3,0,4,1,0,0,44,39


In [25]:
pipe.fit(partial_training[:1000], y_train[:1000])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Pipeline(memory=None,
         steps=[('pca',
                 PCA(copy=True, iterated_power='auto', n_components=None,
                     random_state=None, svd_solver='auto', tol=0.0,
                     whiten=False)),
                ('logistic',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=250,
                                    multi_class='auto', n_jobs=None,
                                    penalty='l2', random_state=None,
                                    solver='lbfgs', tol=0.0001, verbose=0,
                                    warm_start=False))],
         verbose=False)

In [2]:
# Separating dataset into training and testing sets
x_train, x_test, y_train, y_test = cross_validation.train_test_split(
    new_df, new_df['income_level'], test_size=0.3, random_state=0)

# We don't want to include our label (income_level) when fitting
partial_training = x_train[x_train.columns[:-1]]
partial_testing = x_test[x_test.columns[:-1]]

partial_training = partial_training[:-20]

# Fit the pipeline
pipe.fit_sync(partial_training, y_train)

y_pred = pipe.predict(partial_testing)
# Compute various metrics on the testing set
# SyncableMetrics.compute_metrics(
#     pipe, f1_score, y_test, y_pred, partial_testing, "predictionCol",
#     'income_level')
# SyncableMetrics.compute_metrics(
#     pipe, precision_score, y_test, y_pred, partial_testing, "predictionCol",
#     'income_level')

# syncer_obj.sync()

AttributeError: module 'pandas' has no attribute 'read_csv_sync'