# MACHINE LEARNING: CLASSIFICATION

## IMPORT LIBRARIES

In [1]:
import pandas as pd
import numpy as py
import matplotlib.pyplot as plt
import seaborn as sns
import os

## IMPORT DATA FOR CLASSIFICATION

In [2]:
# function to download data
def DownloadData(url=''):
    """Read a CSV from ``url`` into a DataFrame.

    Parameters
    ----------
    url : str
        Location handed to ``pd.read_csv`` (a URL or a path).

    Returns
    -------
    pandas.DataFrame
        The parsed data, or an empty DataFrame when ``url`` is empty or the
        read fails with an ``OSError``.
    """
    data = pd.DataFrame()
    # Always hand back a DataFrame: callers test ``.empty`` on the result,
    # which fails on None.
    if url == '':
        return data
    try:
        data = pd.read_csv(url)
        print('Data downloaded!')
    except OSError:
        print('Network Connection Error!')
    return data
# ---


# function to import data
def importData(file='',url=''):
    """Load the dataset from a local Excel file, downloading it when needed.

    The local file is tried first. If it is missing or holds no rows, the data
    is fetched with ``DownloadData(url)`` and, when that yields rows, written
    to ``file`` so a later call can read it locally.

    Parameters
    ----------
    file : str
        Path of the local Excel copy. '' skips both reading and saving.
    url : str
        CSV location used as the fallback source.

    Returns
    -------
    pandas.DataFrame or None
        The loaded data (empty when every source failed), or None when
        neither ``file`` nor ``url`` is given.
    """
    if url == '' and file == '':
        return None
    data = pd.DataFrame()
    if file != '':
        try:
            data = pd.read_excel(file)
        except FileNotFoundError:
            # no local copy yet; fall through to the download
            pass
    if not data.empty:
        # already on disk, so there is nothing to save again
        print('File loaded!')
        return data
    fetched = DownloadData(url)
    # DownloadData may return None (e.g. for an empty url); keep the empty frame then
    if fetched is not None:
        data = fetched
    if data.empty or file == '':
        print('File not Saved!')
        return data
    try:
        # to_excel returns None, so success is signalled by the absence of an exception
        data.to_excel(file, index=False)
        print('File Saved!')
    except OSError:
        print('File not Saved!')
    return data
#----             

In [3]:
# dataset source and local cache location
url       = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00471/Data_for_UCI_named.csv'
folder    = r'Datasets'
file_name = 'dataset.xlsx'
file_dir  = f'{folder}/{file_name}'
# make sure the cache folder exists before the file is read or written
os.makedirs(folder, exist_ok=True)

# read the local Excel copy if there is one, otherwise fall back to the url
df = importData(file_dir, url)

File loaded!


## Explore Data

In [4]:
# preview the top of the dataset (head() shows five rows by default)
df.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,0.055347,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,-0.005957,stable
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853,0.003471,unstable
3,0.716415,7.6696,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718,0.028871,unstable
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.79711,0.45545,0.656947,0.820923,0.04986,unstable


In [5]:
# get dataframe info: column names, non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   tau1    10000 non-null  float64
 1   tau2    10000 non-null  float64
 2   tau3    10000 non-null  float64
 3   tau4    10000 non-null  float64
 4   p1      10000 non-null  float64
 5   p2      10000 non-null  float64
 6   p3      10000 non-null  float64
 7   p4      10000 non-null  float64
 8   g1      10000 non-null  float64
 9   g2      10000 non-null  float64
 10  g3      10000 non-null  float64
 11  g4      10000 non-null  float64
 12  stab    10000 non-null  float64
 13  stabf   10000 non-null  object 
dtypes: float64(13), object(1)
memory usage: 1.1+ MB


In [6]:
# count the missing entries in every column
df.isnull().sum()

tau1     0
tau2     0
tau3     0
tau4     0
p1       0
p2       0
p3       0
p4       0
g1       0
g2       0
g3       0
g4       0
stab     0
stabf    0
dtype: int64

In [7]:
# check for correlations
# Only numeric columns can be correlated. Selecting them explicitly keeps this
# cell working on pandas versions where corr() no longer drops the string
# column 'stabf' silently and raises instead.
df.select_dtypes(include='number').corr()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab
tau1,1.0,0.015586,-0.00597,-0.017265,0.027183,-0.015485,-0.015924,-0.015807,0.010521,0.01535,-0.001279,0.005494,0.275761
tau2,0.015586,1.0,0.014273,-0.001965,-0.004769,0.006573,0.007673,-0.005963,-0.001742,0.015383,0.016508,-0.011764,0.290975
tau3,-0.00597,0.014273,1.0,0.004354,0.016953,-0.003134,-0.00878,-0.017531,-0.011605,0.007671,0.014702,-0.011497,0.2807
tau4,-0.017265,-0.001965,0.004354,1.0,-0.003173,0.010553,0.006169,-0.011211,-0.004149,0.008431,0.00326,-0.000491,0.278576
p1,0.027183,-0.004769,0.016953,-0.003173,1.0,-0.573157,-0.584554,-0.579239,0.000721,0.015405,0.001069,-0.015451,0.010278
p2,-0.015485,0.006573,-0.003134,0.010553,-0.573157,1.0,0.002388,-0.006844,0.015603,-0.018032,0.007555,0.019817,0.006255
p3,-0.015924,0.007673,-0.00878,0.006169,-0.584554,0.002388,1.0,0.012953,-0.003219,-0.011575,-0.005897,-0.010485,-0.003321
p4,-0.015807,-0.005963,-0.017531,-0.011211,-0.579239,-0.006844,0.012953,1.0,-0.013636,0.00285,-0.003515,0.017505,-0.020786
g1,0.010521,-0.001742,-0.011605,-0.004149,0.000721,0.015603,-0.003219,-0.013636,1.0,0.007559,-0.005836,0.012431,0.282774
g2,0.01535,0.015383,0.007671,0.008431,0.015405,-0.018032,-0.011575,0.00285,0.007559,1.0,-0.012809,-0.014909,0.293601


In [8]:
# check values: count how many rows carry each 'stabf' label to see the class balance
df['stabf'].value_counts()

unstable    6380
stable      3620
Name: stabf, dtype: int64

In [9]:
# build the feature frame and the target series
# Only the numeric 'stab' column is removed; the 'stabf' label stays inside x
# because the later encode/normalize step addresses it as a column of x.
# NOTE(review): drop 'stabf' from the features before fitting any model,
# otherwise the target leaks into the inputs - confirm against the modelling cells.
x = df.drop(columns=['stab'])
y = df['stabf']

In [10]:
# hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=1,
)

# class distribution of the training labels
y_train.value_counts()

unstable    5092
stable      2908
Name: stabf, dtype: int64

In [11]:
# encode categorical data
def encode(x_data,y_data):
    """Label-encode two collections of labels with one shared encoder.

    The encoder learns its classes from ``x_data`` only and the same mapping
    is then applied to ``y_data``, so labels that never occur in ``x_data``
    are unknown to it.

    Returns
    -------
    tuple
        ``(encoded x_data, encoded y_data)``.
    """
    from sklearn.preprocessing import LabelEncoder
    label_encoder = LabelEncoder().fit(x_data)
    return label_encoder.transform(x_data), label_encoder.transform(y_data)

In [25]:
# imbalance
def balance(x,y):
    """Oversample the data with SMOTE so the classes in ``y`` are balanced.

    Parameters
    ----------
    x : array-like
        Feature matrix; SMOTE interpolates between rows, so it must be numeric.
    y : array-like
        Class labels, one per row of ``x``.

    Returns
    -------
    tuple
        ``(x_balanced, y_balanced)`` - the resampled features and labels.
    """
    from imblearn.over_sampling import SMOTE
    smote = SMOTE(random_state=1)
    # fit_resample is the current name of the former fit_sample method
    x_balanced, y_balanced = smote.fit_resample(x, y)
    # return the resampled data, not the untouched inputs
    return x_balanced, y_balanced

In [28]:
# function to normalize
def normalize(train,test,method='Standard',col=''):
    """Scale the columns of ``train`` and ``test`` with a scaler fitted on ``train``.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames with the same columns.
    method : str
        'Standard' selects ``StandardScaler``; any other value selects
        ``MinMaxScaler``.
    col : str
        Column to leave unscaled (e.g. an encoded label). It is appended
        unchanged as the last column of both results. With the default ''
        every column is scaled.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(norm_train_df, norm_test_df)``, each carrying the row index of its
        input frame.
    """
    from sklearn.preprocessing import MinMaxScaler, StandardScaler
    scaler = StandardScaler() if method == 'Standard' else MinMaxScaler()
    # columns that bypass the scaler
    passthrough = [col] if col != '' else []
    train_features = train.drop(columns=passthrough)
    test_features = test.drop(columns=passthrough)
    # Keep the original row index: a fresh 0..n-1 index would no longer line up
    # with frames whose index was shuffled by train_test_split.
    norm_train_df = pd.DataFrame(scaler.fit_transform(train_features),
                                 columns=train_features.columns,
                                 index=train.index)
    norm_test_df = pd.DataFrame(scaler.transform(test_features),
                                columns=test_features.columns,
                                index=test.index)
    # Re-attach the untouched column by position so no index alignment can
    # reorder the values or introduce NaN.
    for name in passthrough:
        norm_train_df[name] = train[name].values
        norm_test_df[name] = test[name].values
    return norm_train_df, norm_test_df

In [29]:
# label-encode the 'stabf' column kept inside the feature frames
# (the encoder is fitted on the training labels only)
train_codes, test_codes = encode(x_train.stabf, x_test.stabf)
x_train = x_train.assign(stabf=train_codes)
x_test  = x_test.assign(stabf=test_codes)
# NOTE(review): the balanced frames are built before scaling and are not
# passed to normalize below - confirm which set the model should train on.
x_train_bal,y_train_bal = balance(x_train,y_train)
# reset_index returns a new frame, so the result has to be assigned. Both
# frames get a clean 0..n-1 index so that normalize can re-attach 'stabf'
# row by row; row order is unchanged, so y_train / y_test still match by position.
x_train = x_train.reset_index(drop=True)
x_test  = x_test.reset_index(drop=True)
x_train, x_test = normalize(x_train,x_test,method='Standard',col='stabf')
# the scaled test set used to be bound to the misspelled name 'xtrest';
# keep that name as an alias in case a later cell still refers to it
xtrest = x_test

ModuleNotFoundError: No module named 'imblearn'