## Multi-Class Modeling

In [1]:
# import packages needed for data handling 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')
import seaborn as sns
sns.set(style="darkgrid")

# import packages to split the data 
from sklearn import model_selection
from sklearn.model_selection import train_test_split

# import models 
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier  
from sklearn.svm import SVC
import xgboost as xgb

# import required packages for evaluating models
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import cross_val_score

In [2]:
# read in modeling data file
cols = ['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10',
       'f11', 'f12', 'f13', 'f14', 'f15', 'f16', 'f17', 'f18', 'f19', 'f20',
       'f21', 'f22', 'f23', 'f24', 'f25', 'f26', 'f27', 'f28', 'f29', 'f30',
       'f31', 'f32', 'f33', 'f34', 'f35', 'f36', 'f37', 'f38', 'f39', 'f40',
       'f41', 'f42', 'f43', 'f44', 'f45', 'f46', 'f47', 'f48', 'f49', 'f50',
       'f51', 'f52', 'f53', 'f54', 'f55', 'f56', 'f57', 'f58', 'f59', 'f60',
       'f61', 'f62', 'f63', 'f64', 'f65', 'f66', 'f67', 'f68', 'f69', 'f70',
       'f71', 'f72', 'f73', 'f74', 'f75', 'f76', 'f77', 'Y1', 'Y2']

df = pd.read_csv('data/modeling.csv', header=0, names=cols)
df.head(2)

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f70,f71,f72,f73,f74,f75,f76,f77,Y1,Y2
0,-0.01821,-0.010433,-0.018399,-0.018279,-2.896385,-0.024231,-0.02066,4.079933,-1.414801,-3.011022,...,0,0,0,0,0,0,0,9.0,0,0
1,-0.01821,-3.1822,-3.260786,-3.270119,-2.037297,-0.024231,-0.02066,3.366161,-3.683655,-3.011022,...,0,0,0,0,0,0,0,9.0,0,0


In [3]:
# make a dataframe just for the targets 
dfy = df.filter(['Y1','Y2'], axis=1)
dfy.head(2)

Unnamed: 0,Y1,Y2
0,0,0
1,0,0


In [4]:
df=df.drop(['Y1','Y2'],axis=1)
df.head(2)

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f68,f69,f70,f71,f72,f73,f74,f75,f76,f77
0,-0.01821,-0.010433,-0.018399,-0.018279,-2.896385,-0.024231,-0.02066,4.079933,-1.414801,-3.011022,...,0,0,0,0,0,0,0,0,0,9.0
1,-0.01821,-3.1822,-3.260786,-3.270119,-2.037297,-0.024231,-0.02066,3.366161,-3.683655,-3.011022,...,0,0,0,0,0,0,0,0,0,9.0


In [5]:
# add a new column 
# this will be the column that will contain all 3 classes for modeling 
dfy['Y'] = np.nan
dfy.head(2)

Unnamed: 0,Y1,Y2,Y
0,0,0,
1,0,0,


In [8]:
y = []

for i in dfy.columns:
    if dfy.loc[(dfy['Y1']==1) & (dfy['Y2']==1)]:
        y.append(1)
    elif dfy.loc[(dfy['Y1']==1) & (dfy['Y2']==0)]:
        y.append(0)
    else:
        y.append(2)

ValueError: The truth value of a DataFrame is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [20]:
test = dfy.loc[:, 'Y1'] == 1 
test

0        False
1        False
2        False
3        False
4        False
5        False
6        False
7        False
8        False
9        False
10       False
11       False
12       False
13       False
14       False
15       False
16       False
17       False
18       False
19       False
20       False
21       False
22       False
23       False
24       False
25       False
26       False
27       False
28       False
29       False
         ...  
52756     True
52757     True
52758     True
52759    False
52760     True
52761    False
52762     True
52763    False
52764     True
52765     True
52766    False
52767     True
52768     True
52769    False
52770     True
52771     True
52772     True
52773     True
52774     True
52775    False
52776     True
52777     True
52778     True
52779     True
52780     True
52781     True
52782     True
52783     True
52784    False
52785    False
Name: Y1, Length: 52786, dtype: bool

In [23]:
#y = []

if dfy.loc[:, 'Y1'] == 1:
        y.append(1)
    
# if dfy.loc[:,'Y2'] == 1:
        
    #else:
  #      df['Y']==0
#else:
  #  df['Y']==2

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [14]:
# Splitting up our data into features and targets

X = df.iloc[:, :-1] # Features
Y = df.Y # Target 1

# Split dataset into training set and test set using a 70/30 split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=2019)

# import the RandomOverSampler package from imblearn
# this package will help address the imbalanced nature of the targets 
from imblearn.over_sampling import RandomOverSampler

# define the RandomOverSampler (ros) model
ros = RandomOverSampler(random_state=2019)

# fit the training data only to the RandomOverSampler model
X_train_resample, Y_train_resample = ros.fit_resample(X_train, Y_train)

TypeError: fit_resample() takes 3 positional arguments but 4 were given