In [1]:
import numpy as np
import pandas as pd

In [2]:
# Location of the ARFF dataset, relative to this notebook.
arff_file_path = '../../data/Amazon_initial_50_30_10000.arff'

In [3]:
# Reading the arff file and checking for missing or invalid data
def load_arff_data(arff_file_path):
    """Parse a dense ARFF file with integer features and a nominal ``class`` attribute.

    Header handling:
      * ``@attribute class {a,b,...}`` defines the list of valid class labels
        and adds a ``'class'`` column.
      * Every other ``@attribute`` line contributes a column whose name is the
        text after the keyword (attribute name and type, e.g. ``'the numeric'``).
      * An attribute declaration that repeats an earlier one is printed,
        left out of the column list, and its value is dropped from every row.
      * ``@data`` / ``@attribute`` are matched case-insensitively.

    Data handling:
      * Blank lines and ``%`` comment lines are ignored.
      * Rows whose last value is not a declared class are reported and skipped.
      * Feature values equal to ``none`` (any casing) become ``0``; all other
        feature values are converted with ``int``.

    Args:
        arff_file_path: Path of the ARFF file to read.

    Returns:
        A tuple ``(data, attributes, classes)`` where ``data`` is a list of
        rows (integer features followed by the class label string),
        ``attributes`` is the list of column names and ``classes`` is the list
        of declared class labels.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a feature value is neither ``none`` nor an integer.
    """
    attributes = []
    classes = []
    data = []
    # Declaration-order positions of attributes that repeat an earlier
    # declaration; the value at each of these positions is removed from rows.
    duplicate_attributes = set()
    reading_data = False
    attribute_index = 0

    with open(arff_file_path, 'r') as af:
        for raw_line in af:
            line = raw_line.rstrip('\n')
            words = line.split(' ')
            keyword = words[0].lower()

            if keyword == '@data':
                reading_data = True
                continue

            if keyword == '@attribute':
                if words[1] == 'class':
                    # Re-join the remainder so a spec written with spaces,
                    # e.g. "{a, b}", is not cut off at the first space.
                    class_spec = ' '.join(words[2:])
                    classes = [
                        c.strip()
                        for c in class_spec.replace('{', '').replace('}', '').split(',')
                    ]
                    attributes.append('class')
                    # check if each class has a unique name
                    if len(classes) != len(set(classes)):
                        print("There are duplicates in the list of classes")
                else:
                    attribute = " ".join(words[1:])
                    if attribute in attributes:
                        print(attribute)
                        # Remember the position so the matching value can be
                        # dropped from each data row.
                        duplicate_attributes.add(attribute_index)
                    else:
                        attributes.append(attribute)
                attribute_index += 1
                continue

            if not reading_data:
                continue

            stripped = line.strip()
            if not stripped or stripped.startswith('%'):
                # Blank lines and ARFF comments carry no sample.
                continue

            values = [v.strip() for v in stripped.split(',')]
            if values[-1] not in classes:
                # if class value is not in the classes list => skip
                print(values[-1] + " is not one of the classes")
                continue
            # remove values for duplicated attributes
            values = [x for i, x in enumerate(values) if i not in duplicate_attributes]
            # if a value is None, none or NONE set it to 0
            features = [0 if str(v).lower() == 'none' else int(v) for v in values[:-1]]
            data.append(features + [values[-1]])

    return (data, attributes, classes)

In [4]:
# Parse the ARFF file, then expose the rows as a DataFrame with one column
# per attribute (the class label is the last column).
(data, attributes, classes) = load_arff_data(arff_file_path)
df = pd.DataFrame(columns=attributes, data=data)

In [5]:
# Preview the first five parsed rows.
df.head(n=5)

Unnamed: 0,the numeric,and numeric,a numeric,of numeric,to numeric,is numeric,I numeric,in numeric,that numeric,it numeric,...,ra_ numeric,le_to numeric,bra numeric,uch_a numeric,ave_a numeric,n_in_ numeric,ied_ numeric,nd_b numeric,rso numeric,class
0,5,3,4,4,1,5,1,3,2,4,...,0,0,0,1,0,1,0,0,0,Agresti
1,12,3,6,2,3,4,2,0,1,3,...,0,0,5,0,1,0,0,0,0,Agresti
2,3,2,2,4,4,2,2,2,3,1,...,0,0,6,0,0,0,0,1,0,Agresti
3,18,4,6,5,4,2,1,0,4,3,...,0,0,0,0,0,0,0,0,0,Agresti
4,13,4,7,5,4,5,0,1,0,4,...,0,0,1,0,0,1,1,0,0,Agresti


In [6]:
# Summary statistics of the numeric feature columns, rounded to two decimals.
df.describe().round(decimals=2)

Unnamed: 0,the numeric,and numeric,a numeric,of numeric,to numeric,is numeric,I numeric,in numeric,that numeric,it numeric,...,ompa numeric,ra_ numeric,le_to numeric,bra numeric,uch_a numeric,ave_a numeric,n_in_ numeric,ied_ numeric,nd_b numeric,rso numeric
count,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,...,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0
mean,11.78,7.47,6.72,6.53,6.19,4.62,3.88,3.59,2.93,2.84,...,0.21,0.21,0.21,0.2,0.2,0.2,0.2,0.2,0.2,0.2
std,5.65,3.57,3.37,3.86,3.37,2.86,4.05,2.35,2.11,2.7,...,0.54,0.61,0.5,0.64,0.51,0.48,0.48,0.5,0.47,0.54
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,8.0,5.0,4.0,4.0,4.0,2.0,1.0,2.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,11.0,7.0,6.0,6.0,6.0,4.0,3.0,3.0,3.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,15.0,10.0,9.0,9.0,8.0,6.0,6.0,5.0,4.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,32.0,21.0,20.0,25.0,24.0,15.0,21.0,15.0,13.0,19.0,...,4.0,6.0,4.0,8.0,5.0,4.0,4.0,5.0,3.0,4.0


In [7]:
# Features are every column except the last; the last column is the target.
X = df.iloc[:, :-1]
Y = df.iloc[:, -1]

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
# Hold out 20% of the samples for testing. The seed is fixed so the split,
# and every result computed from it, is the same after a kernel restart.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

In [10]:
from sklearn.preprocessing import StandardScaler

In [11]:
# Standardise each feature to zero mean and unit variance (library defaults,
# spelled out for the reader).
scaler = StandardScaler(with_mean=True, with_std=True)

In [12]:
# Learn the scaling statistics from the training split only, then apply the
# same transformation to the held-out test split.
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)