In [1]:
import pandas as pd
import numpy as np
import io
import category_encoders as ce


Reading data from the uploaded file.

In [2]:
def read_data(file_name):
    """Load a CSV file into a pandas DataFrame.

    Args:
        file_name: Path (or any other source accepted by ``pd.read_csv``)
            of the CSV to load.

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    return pd.read_csv(file_name)

Cleaning up the columns in the data.

In [3]:
def _standardise_votes(votes):
    """Convert raw ``"<n> votes"`` strings into zero-mean, unit-variance floats.

    Missing entries are treated as zero votes.
    """
    counts = (
        votes.fillna("0")
        .astype(str)
        # Anchored suffix removal. ``str.rstrip(" votes")`` strips a *set of
        # characters*, which only happened to work because digits are not in it.
        .str.replace(r"\s*votes?\s*$", "", regex=True)
        .astype("int")
    )
    centred = counts - counts.mean()
    # Population standard deviation. The previous code divided by the mean
    # squared deviation (the variance), which does not give unit scale.
    spread = centred.std(ddof=0)
    # A constant (or empty) column has no spread; keep the centred values
    # instead of dividing by zero.
    return centred / spread if spread > 0 else centred


def _scale_rating(rating):
    """Map rating strings to floats in [0, 1] assuming a 0-5 rating scale.

    The placeholders ``"NEW"`` and ``"-"`` as well as missing values become 0.
    """
    numeric = (
        rating.str.replace("NEW", "0", regex=False)
        .str.replace("-", "0", regex=False)
        .fillna("0")
        .astype("float")
    )
    return numeric / 5.0


def _drop_constant_zero_bits(frame, encoded_cols):
    """Drop the ``<col>_0`` encoder outputs that exist and are entirely zero."""
    redundant = [
        name
        for name in ("{}_0".format(col) for col in encoded_cols)
        if name in frame.columns and (frame[name] == 0).all()
    ]
    return frame.drop(redundant, axis=1)


def clean_data(df):
    """Prepare the raw restaurant frame for modelling.

    Steps, in order:
      1. Drop RESTAURANT_ID, LOCALITY and TIME.
      2. Print the number of distinct values per remaining column.
      3. VOTES: parse the integer count, then standardise (zero mean,
         unit standard deviation).
      4. RATING: replace "NEW" / "-" / missing with 0, cast to float,
         divide by 5.
      5. Binary-encode TITLE, CUISINES and CITY with ``category_encoders``.
      6. Drop the leading ``*_0`` bit columns, but only when they are
         constant zero.

    Args:
        df: DataFrame containing at least the columns named above. It is not
            modified; all work happens on a copy.

    Returns:
        A new DataFrame with the cleaned and encoded columns.
    """
    drop_cols = ["RESTAURANT_ID", "LOCALITY", "TIME"]
    encoded_cols = ["TITLE", "CUISINES", "CITY"]

    # ``drop`` returns a new frame, so the caller's frame stays untouched.
    df = df.drop(drop_cols, axis=1)
    print("Dropped the columns {}\n".format(str(drop_cols)))

    for col in df.columns.tolist():
        print("Column: {0}, Unique values: {1}".format(col, df[col].nunique()))

    df["VOTES"] = _standardise_votes(df["VOTES"])
    print("\nProcessed column VOTES by removing the unwanted characters and converting it to int type\n")

    df["RATING"] = _scale_rating(df["RATING"])
    print("Processed column RATING by replacing NA values and casting them to float type.\n")

    encoder = ce.BinaryEncoder(cols=encoded_cols)
    df = encoder.fit_transform(df)
    print("Processed columns TITLE, CUISINES, CITY by converting them from categorical to numerical values with binary encoding.\n")

    print("Dropping the zero columns")
    # NOTE(review): whether the encoder emits an always-zero ``_0`` bit appears
    # to depend on the category_encoders version; dropping it blindly could
    # raise KeyError or discard a meaningful bit, so check before dropping.
    df = _drop_constant_zero_bits(df, encoded_cols)
    return df


Loading the training data and inspecting its columns.

In [9]:
# Read the training split from disk and print its column summary.
train_csv_path = "./dataset/Data_Train.csv"
df_train = read_data(train_csv_path)
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12690 entries, 0 to 12689
Data columns (total 9 columns):
TITLE            12690 non-null object
RESTAURANT_ID    12690 non-null int64
CUISINES         12690 non-null object
TIME             12690 non-null object
CITY             12578 non-null object
LOCALITY         12592 non-null object
RATING           12688 non-null object
VOTES            11486 non-null object
COST             12690 non-null int64
dtypes: int64(2), object(7)
memory usage: 892.3+ KB


Loading the testing data and inspecting its columns.

In [10]:
# Read the test split from disk and print its column summary.
test_csv_path = "./dataset/Data_Test.csv"
df_test = read_data(test_csv_path)
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4231 entries, 0 to 4230
Data columns (total 8 columns):
TITLE            4231 non-null object
RESTAURANT_ID    4231 non-null int64
CUISINES         4231 non-null object
TIME             4231 non-null object
CITY             4196 non-null object
LOCALITY         4201 non-null object
RATING           4229 non-null object
VOTES            3829 non-null object
dtypes: int64(1), object(7)
memory usage: 264.5+ KB


In [6]:
# Stack the training and test rows into one frame with a fresh 0..n-1 index.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# with the same ignore_index/sort arguments is the supported equivalent.
# Columns present in only one input are kept and filled with NaN where absent.
df = pd.concat([df_train, df_test], ignore_index=True, sort=False)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16921 entries, 0 to 16920
Data columns (total 9 columns):
TITLE            16921 non-null object
RESTAURANT_ID    16921 non-null int64
CUISINES         16921 non-null object
TIME             16921 non-null object
CITY             16774 non-null object
LOCALITY         16793 non-null object
RATING           16917 non-null object
VOTES            15315 non-null object
COST             12690 non-null float64
dtypes: float64(1), int64(1), object(7)
memory usage: 1.2+ MB


In [7]:
# Clean/encode the combined frame, then rescale COST by a factor of 1/10000.
# ``assign`` replaces the existing COST column in place (same column position).
df_cleaned = clean_data(df).assign(COST=lambda frame: frame["COST"] / 10000)

print("Cleaned data:\n")
df_cleaned.info()

Dropped the columns ['RESTAURANT_ID', 'LOCALITY', 'TIME']

Column: TITLE, Unique values: 123
Column: CUISINES, Unique values: 5183
Column: CITY, Unique values: 450
Column: RATING, Unique values: 32
Column: VOTES, Unique values: 2075
Column: COST, Unique values: 86

Processed column VOTES by removing the unwanted characters and converting it to int type

Processed column RATING by replacing NA values and casting them to int type.

Processed columns TITLE, CUISINES, CITY by converting them from categorical to numerical values with binary encoding.

Dropping the zero columns
Cleaned data:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16921 entries, 0 to 16920
Data columns (total 32 columns):
TITLE_1        16921 non-null int64
TITLE_2        16921 non-null int64
TITLE_3        16921 non-null int64
TITLE_4        16921 non-null int64
TITLE_5        16921 non-null int64
TITLE_6        16921 non-null int64
TITLE_7        16921 non-null int64
CUISINES_1     16921 non-null int64
CUISINES_

In [8]:
# Persist the processed frame. index=True is the pandas default, spelled out
# here because the row index is written as the first (unnamed) CSV column.
df_cleaned.to_csv("processed_data.csv", index=True)