In [1]:
#library for eda and computations
import pandas as pd
import numpy as np
pd.set_option('display.max_columns',200,'display.max_rows',200)

#library for visualizations

import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm_notebook
sns.set_style(style='darkgrid')

#stop warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load both splits from the current working directory (test first, then train)
test, train = pd.read_csv('test.csv'), pd.read_csv('train.csv')

In [3]:
# Keep the target: every train column that has no counterpart in test,
# held as a DataFrame sliced out of train.
target = train[[col for col in train.columns if col not in test.columns]]

In [4]:
#drop the target column from the data frame
# Remove the target column 'y' so train only carries predictor columns
train = train.drop(columns='y')

In [6]:
# Row counts of each split
ntrain, ntest = len(train), len(test)

In [7]:
"""Function for renaming the column due to JSON character"""

import re #Regexp Library

def _short_column_name(column):
    """Collapse one column label into a short, letters-only name."""
    # Every character that is not an ASCII letter becomes a separator.
    # The class must be a-zA-Z: the range A-z also spans [ \ ] ^ _ and `,
    # which would let JSON brackets survive in the new name.
    words = re.sub('[^a-zA-Z]', ' ', str(column)).split()
    if not words:
        # No letters at all: keep the label as it is rather than failing.
        return str(column)
    short = ' '.join(words[:3])
    # The last word is added when it is not already contained in the short
    # name. This is a substring test on purpose: it reproduces the names the
    # rest of the notebook refers to.
    if len(words) >= 3 and words[-1] not in short:
        short = short + ' ' + words[-1]
    return short


def _make_unique(names):
    """Append '_1' to repeated names until every name in the list is unique."""
    seen = set()
    unique_names = []
    for name in names:
        while name in seen:
            name = name + '_1'
        seen.add(name)
        unique_names.append(name)
    return unique_names


def columns_rename(data):
    """Return the non-object columns of ``data`` under shortened names.

    Each selected column label is reduced to its first three words (words are
    runs of ASCII letters), followed by its last word when that word is not
    already contained in the shortened name. Labels that end up identical
    are told apart with a '_1' suffix, repeated as often as needed
    (``name``, ``name_1``, ``name_1_1``, ...).

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``galaxy`` column. All other columns whose dtype is
        not ``object`` are renamed and returned.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the renamed columns in their original order with
        ``galaxy`` appended as the last column. ``data`` itself is left
        untouched.
    """
    galaxy = data.galaxy
    # 'galaxy' is re-attached at the end, so it is never part of the renamed set.
    feat = [col for col, dtype in data.dtypes.items()
            if col != 'galaxy' and dtype != 'object']
    new_names = _make_unique([_short_column_name(col) for col in feat])

    # Work on a copy so the caller's frame does not grow extra columns.
    data_frame = data[feat].copy()
    data_frame.columns = new_names
    data_frame['galaxy'] = galaxy
    return data_frame


In [8]:
# Rename the train columns and attach the target that was set aside earlier
new_train = columns_rename(train).assign(target=target)

In [9]:
# Apply the same column renaming to the test split
new_test=columns_rename(test)

## These features were obtained from a feature-importance run and are used to create lag features

In [10]:
# Renamed columns that receive shifted ("lag") copies later in the notebook
imp_feat = [
    'Education Index',
    'Expected years of',
    'Gross capital formation GGP',
    'Gross income per capita',
    'Income Index',
    'Intergalactic Development Index IDI',
    'Intergalactic Development Index Rank',
    'Interstellar Data Net population',
    'Mean years of',
    'Population using at services',
    'Population using at services_1',
    'Youth unemployment rate ratio',
    'existence expectancy at birth',
    'existence expectancy index',
    'galactic year',
]

In [11]:
# Row counts of the renamed splits
ntrain, ntest = len(new_train), len(new_test)

In [12]:
# Stack train on top of test and renumber the rows 0..n-1
df = pd.concat([new_train, new_test], ignore_index=True)

In [13]:
"""Creating a Unique Column for the galaxy and galactic year to form a unique id"""
def unique_id(data):
    """Add a ``unique_id`` column built from the galaxy name and galactic year.

    For every row the id is the first whitespace-separated token of
    ``galaxy``, an underscore, and ``str()`` of ``galactic year``. The column
    is written onto ``data`` in place.

    Rows are paired by position rather than by index label, so the frame may
    carry any index (for example after filtering), not only 0..n-1.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``galaxy`` column of strings and a ``galactic year``
        column.

    Returns
    -------
    str
        The literal status string ``'Column Created'``.
    """
    # to_numpy() yields the same scalar types that label lookup returned,
    # so the string form of each year is unchanged.
    galaxies = data.galaxy.to_numpy()
    years = data['galactic year'].to_numpy()
    data['unique_id'] = [
        str(galaxy.split()[0]) + '_' + str(year)
        for galaxy, year in zip(galaxies, years)
    ]
    return 'Column Created'
    

In [14]:
# Adds the unique_id column to df in place; the returned status string is the cell output
unique_id(df)

'Column Created'

In [15]:
#Check the number of unique classes
# Report how many distinct values each object-dtype column holds
cat_cols = df.select_dtypes(include='object').columns

separator = '--------------------------'
for col in cat_cols:
    print(f"Number of classes in {col}", df[col].nunique(), separator, sep='\n')

Number of classes in galaxy
181
--------------------------
Number of classes in unique_id
1761
--------------------------


In [16]:
# One-element list wrapping a boolean Series (indexed by column name) that is
# True where fewer than 10% of the column's values are missing.
missing_value=[(100*(df.isna().sum()/df.shape[0]))<10]

In [17]:
# Names of the id and target columns; every other column counts as a feature
ID_COL = 'unique_id'
TARGET_COL = 'target'
features = [col for col in df.columns if col not in (ID_COL, TARGET_COL)]

### Frequency Encoding

We cannot use the unique ID directly, since it is different for the train and test data. However, we can use the number of times each place appears in the dataframe, because that count is not the same for every place.

In [18]:
# Frequency encoding: for every row, the number of rows in df sharing its unique_id
df['unique_id_freq'] = df['unique_id'].map(df['unique_id'].value_counts())

In [19]:
lag_features = imp_feat

# For every selected feature add seven shifted copies, <feature>_lag1 ..
# <feature>_lag7. Copy k holds, within each unique_id group, the value found
# k rows further down (negative shift).
for feat in lag_features:
    for step in range(1, 8):
        shifted = df.groupby(['unique_id'])[feat].shift(-step)
        df[f'{feat}_lag{step}'] = shifted.reset_index()[feat]

In [20]:
# Expanding-mean features of the target, computed over rows ordered by
# galactic year:
#   magic_<i>  - target shifted i rows down, expanding mean, gaps forward-filled
#   magic2_<i> - target shifted i rows up,   expanding mean, gaps back-filled
# NOTE(review): both families are derived from TARGET_COL itself; confirm that
# feeding them to a model is intended.

# The ordering does not change inside the loop, so sort once up front.
target_by_year = df.sort_values(by='galactic year')[TARGET_COL]

for i in tqdm_notebook(range(1, 15)):
    # .ffill() / .bfill() replace fillna(method=...), which recent pandas
    # releases deprecate and later remove.
    df[f'magic_{i}'] = target_by_year.shift(i).expanding().mean().ffill().sort_index()
    df[f'magic2_{i}'] = target_by_year.shift(-i).expanding().mean().bfill().sort_index()

HBox(children=(IntProgress(value=0, max=14), HTML(value='')))




In [21]:
# Write the engineered frame to new_dataset.csv, leaving out the index
df.to_csv('new_dataset.csv',index=False)

## You can explore the dataset for more features