In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json

from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

from sklearn.preprocessing import LabelEncoder



# Shared label encoder; get_data() calls fit_transform on it once per target column.
le = LabelEncoder()

# Seed NumPy's global random state so later random draws start from a fixed point.
np.random.seed(42)


In [21]:
# Scratch draw from NumPy's global random state; running it advances the seeded state.
np.random.randint(10)

3

In [5]:
def get_data(name):
  """Load one dataset, clean it, and split it into features plus three targets.

  Supported names: 'imdb', 'mushroom', 'census', 'bank', 'intention',
  'anuran', 'telco', 'paris', 'smoking', 'flight'. Each dataset is read from
  a CSV file under ./Datasets/.

  Args:
    name: key of the dataset to load.

  Returns:
    A tuple ``(X_final, target1, target2, target3)`` where ``X_final`` is the
    cleaned DataFrame with the three target columns removed and each target is
    the array produced by the module-level ``le.fit_transform`` on that column.

  Raises:
    ValueError: if ``name`` is not one of the supported dataset keys.
  """
  if name == 'imdb':
    movie_data = pd.read_csv('./Datasets/movie_metadata.csv')
    movie_data.drop_duplicates(inplace=True)

    # Split the pipe-separated genre string into a list per movie.
    genre_lists = movie_data['genres'].apply(lambda x: x.split('|'))

    # Collect the distinct genres in first-seen order (fixes the column order).
    genres = []
    for movie_genres in genre_lists:
      for g in movie_genres:
        if g not in genres:
          genres.append(g)

    # One 0/1 indicator column per genre.
    for g in genres:
      movie_data[g] = genre_lists.apply(lambda x, g=g: int(g in x))

    movie_data.drop(["director_name","actor_2_name","movie_title","genres","actor_1_name","actor_3_name","plot_keywords","movie_imdb_link","cast_total_facebook_likes"],axis=1, inplace = True)
    movie_data.dropna(inplace=True)

    # Bucket the numeric score into three named classes. The masks are taken
    # from the numeric score so a label can never be re-matched by a later
    # range; scores outside every range keep their string representation.
    score = movie_data["imdb_score"].astype(float)
    score_labels = score.apply(str)
    score_labels[score.between(8, 10)] = "GOOD"
    score_labels[score.between(5, 7.99)] = "AVERAGE"
    score_labels[score.between(0, 4.992)] = "BAD"
    movie_data["imdb_score"] = score_labels

    # Fold alternative rating names into one label each.
    rating_aliases = {
      "M": "PG",
      "GP": "PG",
      "Unrated": "Not Rated",
      "Passed": "Approved",
      "X": "NC-17",
    }
    movie_data["content_rating"] = movie_data["content_rating"].map(
      lambda rate: rating_aliases.get(rate, rate))

    # Binarise gross with a single threshold: 0.0 up to and including the
    # threshold, 1.0 above it. The previous two overlapping ranges (with a
    # mistyped lower bound) only worked because of statement order and left
    # values outside [0, 762000000] as raw numbers.
    gross_threshold = 15000000.0
    movie_data["gross"] = (movie_data["gross"] > gross_threshold).astype(float)

    movie_data = pd.get_dummies(movie_data,columns=['color','language','country'],drop_first=True)

    target1 = le.fit_transform(movie_data["imdb_score"])
    target2 = le.fit_transform(movie_data["content_rating"])
    target3 = le.fit_transform(movie_data["gross"])

    # Drop all three target columns; "gross" used to stay in the features
    # because "content_rating" was listed twice instead.
    X_final = movie_data.drop(["imdb_score", "content_rating", "gross"],axis=1)

  elif name == 'mushroom':
    mushroom_data = pd.read_csv('./Datasets/mushroom.csv')
    # Rows whose stalk-root is '?' are discarded.
    mushroom_data = mushroom_data[mushroom_data['stalk-root']!='?']
    mushroom_data = pd.get_dummies(mushroom_data,columns=['cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
          'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
          'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
          'stalk-surface-below-ring', 'stalk-color-above-ring',
          'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
          'ring-type', 'spore-print-color'],drop_first=True)
    target1 = le.fit_transform(mushroom_data["Class"])
    target2 = le.fit_transform(mushroom_data["population"])
    target3 = le.fit_transform(mushroom_data["habitat"])
    X_final = mushroom_data.drop(["Class", "population", "habitat"],axis=1)

  elif name == 'census':
    census_data = pd.read_csv('./Datasets/census.csv',names=['age','workclass','fnlwgt','education','education-num','marital-status','occupation','relationship','race','sex','capital-gain','capital-loss','hours-per-week','native-country','income'])
    census_data=census_data.drop_duplicates()
    # Remove every row that has ' ?' in any column.
    for col in census_data.columns:
      census_data = census_data[census_data[col]!=' ?']
    # Income labels with a trailing '.' are merged into the plain labels.
    census_data.loc[census_data.income==' <=50K.','income'] = ' <=50K'
    census_data.loc[census_data.income==' >50K.','income'] = ' >50K'
    census_data = pd.get_dummies(census_data,columns=['education','occupation','relationship','race','sex','native-country'],drop_first=True)
    target1 = le.fit_transform(census_data["income"])
    target2 = le.fit_transform(census_data["marital-status"])
    target3 = le.fit_transform(census_data["workclass"])
    X_final = census_data.drop(["income", "marital-status", "workclass"],axis=1)

  elif name == 'bank':
    bank_data = pd.read_csv('./Datasets/bank-additional.csv')
    # Remove every row that has 'unknown' in any column.
    for col in bank_data.columns:
      bank_data = bank_data[bank_data[col]!='unknown']
    bank_data = pd.get_dummies(bank_data,columns=['job','education','default','contact','month','day_of_week','poutcome','marital'],drop_first=True)
    target1 = le.fit_transform(bank_data["y"])
    target2 = le.fit_transform(bank_data["loan"])
    target3 = le.fit_transform(bank_data["housing"])
    X_final = bank_data.drop(["y", "loan", "housing"],axis=1)

  elif name == 'intention':
    intention_data = pd.read_csv('./Datasets/online_shoppers_intention.csv')
    intention_data = intention_data.drop_duplicates()
    # Rows with VisitorType 'Other' are discarded.
    intention_data = intention_data[intention_data['VisitorType']!='Other']
    intention_data = pd.get_dummies(intention_data,columns=['Month','Weekend'],drop_first=True)
    target1 = le.fit_transform(intention_data["Revenue"])
    target2 = le.fit_transform(intention_data["VisitorType"])
    target3 = le.fit_transform(intention_data["SpecialDay"])
    X_final = intention_data.drop(["Revenue", "VisitorType", "SpecialDay"],axis=1)

  elif name == 'anuran':
    anuran_data = pd.read_csv('./Datasets/Frogs_MFCCs.csv')
    anuran_data.drop(columns='RecordID',inplace=True)
    target1 = le.fit_transform(anuran_data["Family"])
    target2 = le.fit_transform(anuran_data["Genus"])
    target3 = le.fit_transform(anuran_data["Species"])
    X_final = anuran_data.drop(["Family", "Genus", "Species"],axis=1)

  elif name == 'telco':
    telco_data = pd.read_csv('./Datasets/telco.csv')
    telco_data.drop(columns=['customerID'],inplace=True)
    telco_data = telco_data.drop_duplicates()
    # Rows whose TotalCharges is a single space are discarded.
    telco_data = telco_data[telco_data.TotalCharges!=' ']
    telco_data = pd.get_dummies(telco_data,columns=['gender','Partner','Dependents','PhoneService','MultipleLines','InternetService','OnlineSecurity','OnlineBackup','DeviceProtection','TechSupport','StreamingTV','StreamingMovies','PaperlessBilling'],drop_first=True)
    target1 = le.fit_transform(telco_data["Churn"])
    target2 = le.fit_transform(telco_data["Contract"])
    target3 = le.fit_transform(telco_data["PaymentMethod"])
    X_final = telco_data.drop(["Churn", "Contract", "PaymentMethod"],axis=1)

  elif name == 'paris':
    paris_data = pd.read_csv('./Datasets/ParisHousingClass.csv')
    paris_data.drop(columns='made',inplace=True)
    target1 = le.fit_transform(paris_data["category"])
    target2 = le.fit_transform(paris_data["isNewBuilt"])
    target3 = le.fit_transform(paris_data["hasStorageRoom"])
    X_final = paris_data.drop(["category", "isNewBuilt", "hasStorageRoom"],axis=1)

  elif name == 'smoking':
    smoking_data = pd.read_csv('./Datasets/smoking.csv')
    smoking_data.drop(columns=['ID','oral'],inplace=True)
    smoking_data = smoking_data.drop_duplicates()
    smoking_data = pd.get_dummies(smoking_data,columns=['gender'],drop_first=True)
    target1 = le.fit_transform(smoking_data["smoking"])
    target2 = le.fit_transform(smoking_data["tartar"])
    target3 = le.fit_transform(smoking_data["dental caries"])
    X_final = smoking_data.drop(["smoking", "tartar", "dental caries"],axis=1)

  elif name == 'flight':
    flight_data = pd.read_csv('./Datasets/flight.csv')
    flight_data.dropna(inplace=True)
    flight_data = pd.get_dummies(flight_data,columns=['Gender','Type of Travel'],drop_first=True)
    target1 = le.fit_transform(flight_data["satisfaction"])
    target2 = le.fit_transform(flight_data["Customer Type"])
    target3 = le.fit_transform(flight_data["Class"])
    X_final = flight_data.drop(["satisfaction", "Customer Type", "Class"],axis=1)

  else:
    raise ValueError('Incorrect dataset')

  return X_final, target1, target2, target3

In [None]:
def plotDist(arr, title):
  """Draw a bar chart of how often each distinct value occurs in ``arr``.

  The bars are drawn on the current matplotlib axes and the axes title is set
  to ``title``; the figure is not shown here, so the caller decides when to
  call ``plt.show()``.

  Args:
    arr: array-like of values to count.
    title: title for the current axes.
  """
  values, counts = np.unique(arr, return_counts=True)
  # Pass the two arrays straight to bar(): stacking them into one array would
  # coerce labels and counts to a common dtype (string labels would turn the
  # counts into strings), and unpacking the stacked pairs fails on empty input.
  plt.bar(values, counts)

  plt.title(title)

# Read the dataset catalogue; the context manager closes the file handle,
# which a bare json.load(open(...)) leaves open.
with open('data_structure.json') as structure_file:
  data = json.load(structure_file)
dataKeys = list(data.keys())

# One row of three subplots per dataset. The grid and the figure height are
# derived from the number of datasets (20x50 for ten) rather than fixed at ten rows.
n_rows = max(len(dataKeys), 1)
plt.figure(figsize=(20, 5 * n_rows), dpi=80)

for i, key in enumerate(dataKeys):
  dataSet = data[key]
  # `target` receives the feature frame returned by get_data; only the three
  # encoded targets are plotted here.
  target, t1, t2, t3 = get_data(key)

  # The 't1'/'t2'/'t3' entries of the catalogue are used as subplot titles.
  plt.subplot(n_rows, 3, i*3+1)
  plotDist(t1, dataSet['t1'])
  plt.subplot(n_rows, 3, i*3+2)
  plotDist(t2, dataSet['t2'])
  plt.subplot(n_rows, 3, i*3+3)
  plotDist(t3, dataSet['t3'])
plt.show()


## Baseline Classification


In [69]:

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

def getClassificationReports(dataSet, test_size=0.2, random_state=None):
  """Fit a Gaussian naive Bayes baseline for each target of a dataset and print its report.

  The feature frame and the three encoded targets returned by ``get_data`` are
  split together, so every target uses the same train/test rows. Each model is
  fitted on the feature columns only: no target column is ever part of the
  model input, and the frame returned by ``get_data`` is not modified.

  Args:
    dataSet: dataset key understood by ``get_data``.
    test_size: passed to ``train_test_split``; defaults to the original 0.2.
    random_state: passed to ``train_test_split``; ``None`` keeps the original
      behaviour of drawing from NumPy's global random state.

  Returns:
    None. One classification report per target is printed.
  """
  print(f'--------------------------[ Classification Report for {dataSet} ]--------------------------')
  features, target1, target2, target3 = get_data(dataSet)

  # Splitting all arrays in one call keeps their rows aligned. The result is
  # [X_train, X_test, t1_train, t1_test, t2_train, t2_test, t3_train, t3_test].
  splits = train_test_split(
    features, target1, target2, target3,
    test_size=test_size, random_state=random_state)
  trainFeatures, testFeatures = splits[0], splits[1]

  for offset in range(2, len(splits), 2):
    trainTarget = splits[offset]
    testTarget = splits[offset + 1]

    # Create and fit a Gaussian model on the features only
    gaussian = GaussianNB()
    gaussian.fit(trainFeatures, trainTarget)

    # Generate predictions
    predictions = gaussian.predict(testFeatures)

    # Print the classification report
    print(classification_report(testTarget, predictions))



# Read the dataset catalogue; the context manager closes the file handle,
# which a bare json.load(open(...)) leaves open.
with open('data_structure.json') as structure_file:
  data = json.load(structure_file)
dataKeys = list(data.keys())

# Print the baseline reports for every dataset listed in the catalogue.
for i, key in enumerate(dataKeys):
  getClassificationReports(key)





--------------------------[ Classification Report for imdb ]--------------------------


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.86      1.00      0.93       646
           1       0.00      0.00      0.00        69
           2       0.00      0.00      0.00        33

    accuracy                           0.86       748
   macro avg       0.29      0.33      0.31       748
weighted avg       0.75      0.86      0.80       748



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00         5
           1       0.00      0.00      0.00        14
           2       0.00      0.00      0.00         1
           3       0.00      0.00      0.00        13
           4       0.00      0.00      0.00       130
           5       0.44      0.27      0.33       257
           6       0.49      0.88      0.63       328

    accuracy                           0.48       748
   macro avg       0.13      0.16      0.14       748
weighted avg       0.36      0.48      0.39       748

              precision    recall  f1-score   support

           0       0.59      0.69      0.64       258
           1       0.82      0.75      0.78       490

    accuracy                           0.73       748
   macro avg       0.71      0.72      0.71       748
weighted avg       0.74      0.73      0.73       748

--------------------------[ Classification Report for mushroom ]------------

KeyboardInterrupt: 