In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import Imputer
from sklearn.experimental import enable_iterative_imputer  
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge
import matplotlib.pyplot as plt
from numpy.core.defchararray import add
from sklearn.preprocessing import MultiLabelBinarizer

In [None]:
np.random.seed(0) 
df = pd.read_csv("data/Android/googleplaystore.csv")

In [None]:
#Drop Duplicate along the App Column subset and dropna for columns with small percentage of missing value
df.drop_duplicates(subset='App',inplace = True)
df.dropna(subset=['Type', 'Content Rating','Current Ver','Android Ver'],inplace = True)

In [None]:
#Merge Category and Genre in one column
df_ex = ((((df.Category.str.lower()).str.split("_")).str.join(' ')).str.split("and")).str.join('&')
df_gx = ((df.Genres.str.lower()).str.split(";"))
listx=[]
for ex,gx in zip(df_ex,df_gx):
    if ex != gx[0]:
        if not (ex in gx[0]):
            gx.append(ex)
            if "educational" in gx:
                gx.remove("educational")
                gx.append("education")
    listx.append(np.unique(gx))
df["Genres"] = np.asarray(listx)

In [None]:
#Remove Column Category, Price, Last Updated,Current Ver, Android Ver for there irrelvance
df = df.drop(["Category","Price","Last Updated","Current Ver","Android Ver"],axis=1)

In [None]:
# Correct and convert size column to int
def change_size(size):
    if 'M' in size:
        x = size[:-1]
        x = float(x)*1000000
        return(x)
    elif 'k' == size[-1:]:
        x = size[:-1]
        x = float(x)*1000
        return(x)
    else:
        return None

df["Size"] = df["Size"].map(change_size)

df.Size.fillna(method = 'ffill', inplace = True)

In [None]:
# Convert installs to int
df['Installs'] = [int(i[:-1].replace(',','')) for i in df['Installs']]

In [None]:
#-----Remove unrated and label code the content rating
df = df.drop(df[df["Content Rating"]=="Unrated"].index)
RatingL = df['Content Rating'].unique()
RatingDict = {}
for i in range(len(RatingL)):
    RatingDict[RatingL[i]] = i
df['Content Rating'] = df['Content Rating'].map(RatingDict).astype(int)

In [None]:
#Clean Type
def type_clean(types):
    if "Free" in types:
        return 0
    else:
        return 1

df['Type'] = df['Type'].map(type_clean).astype(int)

In [None]:
#Convert Reviews to int
df['Reviews'] = df['Reviews'].astype(int)

In [None]:
#Convert non-null Ratings to float
df['Rating'] = df['Rating'].astype(float)

In [None]:
#Fix indexing
new_index = np.arange(0,len(df))
df = df.set_index(new_index)

In [None]:
#One Hot encode all genres
mlb = MultiLabelBinarizer()
df = df.join(pd.DataFrame(mlb.fit_transform(df.pop('Genres')),
                          columns=mlb.classes_,
                          index=df.index))

In [None]:
df.head()

In [None]:
df.plot.scatter('Rating','Reviews')

In [None]:
df.plot.scatter('Rating','Size')

In [None]:
df.plot.scatter('Rating','Installs')

In [None]:
from sklearn.preprocessing import MinMaxScaler
df_scaled = df.copy().dropna()
scaler = MinMaxScaler(feature_range=(0, 1))
df_scaled[['Rating','Reviews']] = scaler.fit_transform(df_scaled[['Rating','Reviews']])
df_scaled[['Rating','Reviews']].head()

In [None]:
df_scaled[['Rating','Size']] = scaler.fit_transform(df_scaled[['Rating','Size']])
df_scaled[['Rating','Size']].head()

In [None]:
df_scaled[['Rating','Installs']] = scaler.fit_transform(df_scaled[['Rating','Installs']])
df_scaled[['Rating','Installs']].head()

In [None]:
df_scaled.plot.scatter('Rating','Reviews')

In [None]:
df_scaled.plot.scatter('Rating','Size')

In [None]:
df_scaled.plot.scatter('Rating','Installs')

In [None]:
X1_a = df_scaled['Rating'].values.reshape(-1,1) 
X2_a = df_scaled['Reviews'].values.reshape(-1,1)
# values method takes a column from pandas dataframe and makes it into an array of values --
# reshape arranges the extracted values in a numpy array with a shape we define --
# an array shape is the number of rows and columns in the array
# when we use parameter -1, it means we don't know the shape of the resulting array (e.g. X1)  -- 
# and numpy will figure it out based on length of array and whatever dimensions left
# in the case of reshape(-1, 1) we tell numpy to figure out the number of rows (parameter -1) but we have one column (parameter 1)

X_a = np.concatenate((X1_a,X2_a),axis=1) # define a NumPy array from the two arrays

In [None]:
X1_b = df_scaled['Rating'].values.reshape(-1,1) 
X2_b = df_scaled['Size'].values.reshape(-1,1)
# values method takes a column from pandas dataframe and makes it into an array of values --
# reshape arranges the extracted values in a numpy array with a shape we define --
# an array shape is the number of rows and columns in the array
# when we use parameter -1, it means we don't know the shape of the resulting array (e.g. X1)  -- 
# and numpy will figure it out based on length of array and whatever dimensions left
# in the case of reshape(-1, 1) we tell numpy to figure out the number of rows (parameter -1) but we have one column (parameter 1)

X_b = np.concatenate((X1_b,X2_b),axis=1) # define a NumPy array from the two arrays

In [None]:
X1_c = df_scaled['Rating'].values.reshape(-1,1) 
X2_c = df_scaled['Installs'].values.reshape(-1,1)
# values method takes a column from pandas dataframe and makes it into an array of values --
# reshape arranges the extracted values in a numpy array with a shape we define --
# an array shape is the number of rows and columns in the array
# when we use parameter -1, it means we don't know the shape of the resulting array (e.g. X1)  -- 
# and numpy will figure it out based on length of array and whatever dimensions left
# in the case of reshape(-1, 1) we tell numpy to figure out the number ofdf_scaled rows (parameter -1) but we have one column (parameter 1)

X_c = np.concatenate((X1_c,X2_c),axis=1) # define a NumPy array from the two arrays

In [None]:
# Import models
from pyod.models.hbos import HBOS # histogram-based outlier detection module
from pyod.models.cblof import CBLOF # cluster-based local outlier factor detection module
from pyod.models.knn import KNN # k nearest neighbors module
from pyod.models.lof import LOF # local outlier factor module


random_state = np.random.RandomState(42)
outliers_fraction = 0.05
# Define four outlier detection tools to be compared
# KNN uses the data point's distance to the farthest KNN for the outlier score
# Average KNN uses the average score for the data point's k nearest neighbors as the outlier score

# Create dictionary structure with model names and function calls
classifiers = {
#     'Histogram-base Outlier Detection (HBOS)': HBOS(contamination=outliers_fraction),    
#     'Local Outlier Factor (LOF)':LOF(contamination=outliers_fraction),
    #'Cluster-based Local Outlier Factor (CBLOF)':CBLOF(contamination=outliers_fraction,check_estimator=False, random_state=random_state),
    'K Nearest Neighbors (KNN)': KNN(contamination=outliers_fraction),
    #'Average KNN': KNN(method='mean',contamination=outliers_fraction)
}

In [None]:
df

In [None]:
import matplotlib.pyplot as plt
# For Box-Cox Normalization
from scipy import stats



# # Define a grid with equally spaced cells using NumPy for visualization
# xx , yy = np.meshgrid(np.linspace(0,1 , 200), np.linspace(0, 1, 200))

# for iterator, call each model name and model function from the dictionary
for i, (clf_name, clf) in enumerate(classifiers.items()):
    
    # Fit the model in current iteration to data X which contains the two scaled attributes
    clf.fit(X_a) 
    
    # Predict raw anomaly score for entire dataset X using decision function
    # Decision function is a conceptual separator between two classes in the dataset, in this case "normal" versus "outlier"
    scores_pred = clf.decision_function(X_a) * -1 
        
    # Label each data point in X as outlier or inlier: label = 1 is for outlier and label = 0 is for inlier
    y_pred = clf.predict(X_a)
    

y_pred = np.array(y_pred)    
outlier_index = np.asarray(np.where(y_pred == 1)).flatten()
# outlier_index.reshape((1, -1))

for i in range(len(outlier_index)):
    print(outlier_index[i])

In [None]:
for i, (clf_name, clf) in enumerate(classifiers.items()):
    
    # Fit the model in current iteration to data X which contains the two scaled attributes
    clf.fit(X_b) 
    
    # Predict raw anomaly score for entire dataset X using decision function
    # Decision function is a conceptual separator between two classes in the dataset, in this case "normal" versus "outlier"
    scores_pred = clf.decision_function(X_b) * -1 
        
    # Label each data point in X as outlier or inlier: label = 1 is for outlier and label = 0 is for inlier
    y_pred = clf.predict(X_b)
    

y_pred = np.array(y_pred)    
outlier_index1 = np.asarray(np.where(y_pred == 1)).flatten()
# outlier_index1.reshape((1, -1))

for i in range(len(outlier_index1)):
    print(outlier_index1[i])

In [None]:
for i, (clf_name, clf) in enumerate(classifiers.items()):
    
    # Fit the model in current iteration to data X which contains the two scaled attributes
    clf.fit(X_c) 
    
    # Predict raw anomaly score for entire dataset X using decision function
    # Decision function is a conceptual separator between two classes in the dataset, in this case "normal" versus "outlier"
    scores_pred = clf.decision_function(X_c) * -1 
        
    # Label each data point in X as outlier or inlier: label = 1 is for outlier and label = 0 is for inlier
    y_pred = clf.predict(X_c)
    

y_pred = np.array(y_pred)    
outlier_index2 = np.asarray(np.where(y_pred == 1)).flatten()
# outlier_index2.reshape((1, -1))

for i in range(len(outlier_index2)):
    print(outlier_index2[i])
    
print(type(outlier_index2[1]))


In [None]:
concat_index = np.concatenate((outlier_index, outlier_index1, outlier_index2))
len(concat_index)

In [None]:
concat_index = np.unique(concat_index)
len(concat_index)

In [None]:
len(df)

In [None]:
df = df.drop(concat_index)

In [None]:
#Fix indexing
new_index = np.arange(0,len(df))
df = df.set_index(new_index)

In [None]:
df

In [None]:
df.plot.scatter('Rating','Reviews')

In [39]:
#Checking for any missing values
missing_values_count = df.isnull().sum()
missing_values_count

App                           0
Rating                     1347
Reviews                       0
Size                          0
Installs                      0
                           ... 
travel & local                0
trivia                        0
video players & editors       0
weather                       0
word                          0
Length: 61, dtype: int64

In [40]:
#Impute the missing nans in Rating using multivariate imputation
reg_imputer = IterativeImputer(BayesianRidge(), max_iter=5, random_state=0)
subset_data = df.loc[:, 'Rating':'word']
imputed_subset = pd.DataFrame(reg_imputer.fit_transform(subset_data), columns=subset_data.columns)
df.loc[:,'Rating':'word'] = imputed_subset.loc[:,'Rating':'word']



In [41]:
#Checking for any missing values
missing_values_count = df.isnull().sum()
missing_values_count

App                        0
Rating                     0
Reviews                    0
Size                       0
Installs                   0
                          ..
travel & local             0
trivia                     0
video players & editors    0
weather                    0
word                       0
Length: 61, dtype: int64