In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and echo the full path of every file found in it
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
#read train and test file

# Loaded DataFrames, in the order read_file() was called
file = []


def read_file(path):
    """Read the CSV at ``path`` and append the resulting DataFrame to ``file``.

    The function never prompts for input, so the notebook can be re-run
    unattended (Restart & Run All).

    Parameters
    ----------
    path : str
        Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns
    -------
    None
        The frame is only stored in the module-level ``file`` list.
    """
    frame = pd.read_csv(path)
    file.append(frame)
    
# Load the training split first and the test split second, so they are appended in that order
for csv_path in ('../input/mobile-price-classification/train.csv',
                 '../input/mobile-price-classification/test.csv'):
    read_file(csv_path)

In [3]:
# Unpack the two loaded frames: index 0 came from train.csv, index 1 from test.csv
train, test = file[0], file[1]

# Peek at the first rows of the training data
train.head()

In [4]:
# (rows, columns) of the frame loaded from test.csv
test.shape

In [5]:
# (rows, columns) of the frame loaded from train.csv
train.shape

In [6]:
#Checking Null Values, Duplicated Rows, Outliers, and feature selections

def check(case, df):
    """Run one quick data-quality check on ``df``.

    Parameters
    ----------
    case : str
        Which check to run:
        'null'      - count missing values (``df.isnull().sum()``)
        'duplicate' - count duplicated rows (``df.duplicated().sum()``)
        'outlier'   - print the 1.5 * IQR lower and upper bounds
    df : pandas.DataFrame or pandas.Series
        Data to inspect.

    Returns
    -------
    The null counts for 'null', the duplicate count for 'duplicate', and
    None for 'outlier' (the bounds are printed, not returned).

    Raises
    ------
    ValueError
        If ``case`` is not one of the three supported names. Without this a
        typo would quietly return None and look like "nothing found".
    """
    if case == 'null':
        return df.isnull().sum()

    if case == 'duplicate':
        return df.duplicated().sum()

    if case == 'outlier':
        q1 = df.quantile(0.25)
        q3 = df.quantile(0.75)
        iqr = q3 - q1
        # Values outside [q1 - 1.5*IQR, q3 + 1.5*IQR] are treated as outliers
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr
        print('Upper Bound Value : {}'.format(upper_bound), '\n')
        print('Lower Bound Value : {}'.format(lower_bound))
        return None

    raise ValueError(
        "unknown case {!r}; expected 'null', 'duplicate' or 'outlier'".format(case)
    )
        
# Missing-value check on the training data
check('null',train)

In [7]:
# Duplicated-row check on the training data
check('duplicate',train)

In [8]:
# Print the 1.5 * IQR outlier bounds for the training data
check('outlier', train)

In [9]:
# Summary of the training frame's columns (used below to confirm every column is numeric)
train.info()

Every column in the dataset is numeric, so we can consider all of the features when predicting `price_range`.

In [10]:
import matplotlib.pyplot as plt
import seaborn as sns

# One figure per column: histogram with KDE on top, boxplot underneath
for a in train.columns:
    fig, ax = plt.subplots(2, 1, figsize=(10, 12))

    # histplot(kde=True) replaces the deprecated distplot; its bars are raw
    # counts, which is what the 'Frequency' axis label below describes
    sns.histplot(x=train[a], kde=True, ax=ax[0])
    # Pass the column as x= explicitly: newer seaborn reads a positional
    # Series as `data`, which changes the orientation of the box
    sns.boxplot(x=train[a], ax=ax[1])

    ax[0].set_title('Distribution Plot')
    ax[0].set_ylabel('Frequency')

    ax[1].set_title('BoxPlot')

    fig.suptitle(a.title(), fontsize=20)

    # Render and release each figure inside the loop so they do not all stay
    # open at once (matplotlib warns once more than 20 figures are open)
    plt.show()
    plt.close(fig)
    
    
    

In [11]:
# First rows of the training data, shown again for reference before the correlation check
train.head()

In [12]:
# Check the correlation between every column and price_range
for a in train.columns:
    try:
        corr_value = train[a].corr(train["price_range"])
    except (TypeError, ValueError) as err:
        # A column that cannot be correlated (e.g. non-numeric) is reported
        # instead of being silently dropped by a bare `except: pass`
        print(f'Corr {a}: skipped ({err})')
        continue
    print(f'Corr {a}: {corr_value}')

In [13]:
#Let's visualize the correlation between features and label 

# The label column is the same for every plot, so look it up once
label = train['price_range']

for a in train.columns:
    if a == 'price_range':
        continue  # the label is not plotted against itself

    feature = train[a]
    corr = feature.corr(label)

    fig, ax = plt.subplots(figsize=(9, 6))
    ax.scatter(feature, label)
    ax.set_xlabel(a.title())
    ax.set_ylabel('Price Range')
    ax.set_title('Price Range Vs ' + a + ' : {}'.format(corr))

    # Show and release each figure as it is produced rather than keeping
    # every figure open until the loop has finished
    plt.show()
    plt.close(fig)

In [14]:
#Select cols based on correlation values

# Three chosen features plus the label column
col_list = [
    'ram',
    'px_height',
    'px_width',
    'price_range',
]

train_final = train.loc[:, col_list]
train_final.head()

In [20]:
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

feature = ['ram', 'px_height', 'px_width']
label = ['price_range']

# train[label] is a one-column frame, so .values is shaped (n, 1); ravel() turns
# it into the 1-D target sklearn classifiers expect (avoids DataConversionWarning)
X, y = train[feature].values, train[label].values.ravel()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Define preprocessing for numeric columns.
# X is a plain array, so columns are addressed by position, in the order of `feature`.
num_features = [0, 1, 2]
num_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer(transformers=[
    ('num', num_transformer, num_features)
])

pipeline = Pipeline(steps=[
    ('preprocessing', preprocessor),
    # multi_class is left at its default: the original passed 'auto', which was
    # already the default, and recent scikit-learn deprecates the parameter
    ('logregression', LogisticRegression(C=1/0.1, solver='lbfgs', max_iter=10000))
])

model = pipeline.fit(X_train, y_train)
print(model)

In [21]:
#Check accuracy using confusion matrix

# Predict the held-out rows, then tabulate true classes against predicted classes
prediction = model.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=prediction)

cm_table = pd.DataFrame(cm)
print(cm_table)

In [22]:
from sklearn.metrics import accuracy_score

# Overall accuracy of the predictions on the held-out split
print('Accuracy : ', accuracy_score(y_test, prediction))

In [23]:
#Plot Confusion Matrix
price_class = [0, 1, 2, 3]
tick_marks = np.arange(len(price_class))

# Draw the matrix as a heatmap on an explicit Axes
fig, ax = plt.subplots()
heatmap = ax.imshow(cm, interpolation='nearest')
fig.colorbar(heatmap, ax=ax)

# One tick per class on both axes, labelled with the class value
ax.set_xticks(tick_marks)
ax.set_xticklabels(price_class)
ax.set_yticks(tick_marks)
ax.set_yticklabels(price_class)

ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')

plt.show()

In [29]:
# Print sklearn's per-class metrics report comparing the predictions with the true labels
print(classification_report(y_test, prediction))

In [32]:
from sklearn.metrics import roc_curve, roc_auc_score

y_score = model.predict_proba(X_test)
print(y_score)

fpr = {}
tpr = {}
tresh = {}

curve_colors = ['blue', 'red', 'green', 'orange']

# predict_proba columns are ordered like model.classes_, so each column is paired
# with its own class label instead of assuming "label == column index".
# The dicts stay keyed by column index.
for a, class_label in enumerate(model.classes_):
    fpr[a], tpr[a], tresh[a] = roc_curve(y_test, y_score[:, a], pos_label=class_label)

    # One-vs-rest curve for this class; colours cycle if there are more classes than colours
    plt.plot(
        fpr[a],
        tpr[a],
        color=curve_colors[a % len(curve_colors)],
        linestyle='--',
        label=str(class_label) + ' vs Rest',
    )

plt.title('Multiclass ROC curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive rate')
plt.legend(loc='best')
plt.show()

In [33]:
# ROC AUC over all classes using the one-vs-rest scheme
auc = roc_auc_score(y_true=y_test, y_score=y_score, multi_class='ovr')
print(auc)