In [1]:
# importing libraries

import pandas as pd
import numpy as np

from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate
from scipy.optimize import minimize
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import warnings 
warnings.filterwarnings('ignore')

In [2]:
# Load the pre-processed train and test tables
pd.set_option("display.max_columns", None)  # never truncate columns when a frame is displayed

# labelled training rows
train = pd.read_csv("removed_corr_features_train.csv")

# unlabelled test rows
test = pd.read_csv("scaled_test.csv")

# Stack train on top of test; each frame keeps its own row labels.
df = pd.concat([train, test], sort=False)
train.head()



Unnamed: 0,galactic year,galaxy,existence expectancy index,Gross income per capita,Income Index,Expected years of education (galactic years),Mean years of education (galactic years),Intergalactic Development Index (IDI),"Intergalactic Development Index (IDI), Rank",Population using at least basic drinking-water services (%),Population using at least basic sanitation services (%),Gross capital formation (% of GGP),"Population, total (millions)","Population, urban (%)","Mortality rate, under-five (per 1,000 live births)",Old age dependency ratio (old age (65 and older) per 100 creatures (ages 15-64)),Young age (0-14) dependency ratio (per 100 creatures ages 15-64),"Adolescent birth rate (births per 1,000 female creatures ages 15-19)",Total unemployment rate (female to male ratio),Vulnerable employment (% of total employment),"Unemployment, total (% of labour force)",Employment in agriculture (% of total employment),Labour force participation rate (% ages 15 and older),"Labour force participation rate (% ages 15 and older), female","Labour force participation rate (% ages 15 and older), male",Jungle area (% of total land area),"Share of employment in nonagriculture, female (% of total employment in nonagriculture)",Youth unemployment rate (female to male ratio),"Unemployment, youth (% ages 15–24)","Mortality rate, male grown up (per 1,000 people)","Infants lacking immunization, red hot disease (% of one-galactic year-olds)","Infants lacking immunization, Combination Vaccine (% of one-galactic year-olds)","Gross galactic product (GGP), total","Outer Galaxies direct investment, net inflows (% of GGP)",Exports and imports (% of GGP),Share of seats in senate (% held by female),Natural resource depletion,"Maternal mortality ratio (deaths per 100,000 live births)",Renewable energy consumption (% of total final energy consumption),"Estimated gross galactic income per capita, female",Rural population with access to electricity (%),Domestic credit provided by financial sector 
(% of GGP),"Remittances, inflows (% of GGP)","Gross enrolment ratio, primary (% of primary under-age population)","Respiratory disease incidence (per 100,000 people)",Interstellar phone subscriptions (per 100 people),"Interstellar Data Net users, total (% of population)",Current health expenditure (% of GGP),Gender Development Index (GDI),Adjusted net savings,"Creature Immunodeficiency Disease prevalence, adult (% ages 15-49), total",Private galaxy capital flows (% of GGP),Gender Inequality Index (GII),y
0,990025,96,-1.502033,-0.241384,-0.922203,-1.792495,-1.439277,-1.475295,1.368197,-1.427295,-1.505689,1.192959,-0.384152,-2.896943,1.11731,-1.139819,0.922369,1.49908,0.260803,1.674453,-0.685651,1.849428,1.054639,0.689971,0.464296,1.279038,-0.038742,0.322454,-0.763565,1.298043,-1.305901,-1.355333,-0.126713,-0.522262,-1.173157,-0.263583,-0.962916,2.41777,2.230747,-0.678917,-1.600281,-1.064585,0.286828,-0.748021,1.205582,-0.822818,-0.929806,0.014081,-0.589526,0.184573,1.020473,0.21733,1.165783,0.05259
1,990025,33,-0.335081,-0.077629,0.142624,-1.114988,-1.652066,0.226422,0.291634,-0.00047,-0.01307,0.814087,-0.838002,0.693868,0.382949,-1.019545,0.441068,-0.125306,0.835759,-0.15936,0.161021,-0.208982,-1.289865,-1.85507,1.794636,-1.317682,-2.3447,0.812596,0.112821,0.327776,0.356616,0.196949,-0.252031,0.163119,-1.707048,-1.075334,-0.242879,-0.068946,-0.44061,-0.600156,-0.035494,-1.514391,-1.019477,0.522195,-0.291985,-0.618198,-1.064926,-1.030153,-0.958876,-0.132206,-0.115992,-0.051129,0.966971,0.059868
2,990025,178,-1.312374,-1.241168,-1.677564,-1.625361,-1.394316,-1.853442,1.263906,-2.070158,-2.166473,-2.542581,-1.081589,-1.006463,2.804737,-0.482334,1.698207,2.903376,-0.656899,1.672347,-1.329933,1.660636,1.241614,0.887944,2.057229,-0.398423,0.247112,-0.715094,-1.693266,2.542889,-0.023531,-0.052327,-0.83666,-0.552734,-1.405042,-0.431258,0.002279,3.49723,2.54583,-1.253852,-2.233278,-1.051453,-1.404469,-1.371133,1.213732,-1.848547,-1.437951,0.379403,-1.472998,0.087905,0.229678,0.802142,1.720109,0.050449
3,990025,163,-1.950487,0.143511,-1.531104,-2.706783,-1.779094,-2.192204,1.70041,-3.478277,-2.609325,-1.349575,-1.008319,-1.525368,3.965829,-1.040586,0.894599,2.110267,-0.06313,2.414199,-1.397177,2.334208,1.812552,1.515996,0.543388,-0.250277,0.08651,-0.639854,-1.492845,2.879,3.340414,3.544285,-0.895626,-0.137467,-0.54047,-0.832887,1.707579,5.215544,2.586768,-0.275602,-2.871496,-0.632834,-0.866392,-3.978544,2.138354,-1.928471,-0.924833,-0.516899,-2.640633,-2.135965,-0.003672,0.367055,1.710022,0.049394
4,990025,155,0.731386,2.646683,1.582914,-0.243095,0.935054,0.664158,-1.076832,1.240322,0.965492,0.489152,-0.468705,1.456722,-1.579841,1.283474,-1.236648,-1.352118,0.658214,-1.618296,-0.180395,-1.643086,-1.694591,0.064101,-1.126899,0.378561,0.361532,0.629391,0.022214,-1.43471,-0.216905,-1.001825,0.062769,1.200834,3.465428,0.563602,0.202044,-1.188468,-1.041995,2.498996,0.981613,1.610696,-0.743654,-0.809824,-0.628586,1.589276,2.004523,0.222772,0.981315,1.219614,-0.116418,-1.421962,-1.593735,0.154247


In [4]:
# (rows, columns) of the training frame as loaded
train.shape

(3865, 54)

In [5]:
# (rows, columns) of the test frame as loaded
test.shape

(890, 53)

In [6]:
# Re-split the stacked frame by position: the first len(train) rows of `df`
# are the training rows, the remainder are the test rows. The boundary is
# derived from `train` itself rather than a hard-coded row count, so the split
# stays correct if the input CSV changes size.
n_train = len(train)
train = df.iloc[:n_train]
test = df.iloc[n_train:].drop("y", axis=1)  # "y" is the label column; test rows must not carry it
test_res = test.copy()  # untouched copy of the test features, reused by a later cell

In [7]:
# Training set: number of distinct galaxies and mean number of rows per galaxy
train_gal = set(train["galaxy"])
s = sum(len(train.loc[train['galaxy'] == x]) for x in train_gal)  # total rows across all galaxies
print("Total distinct galaxies: {}".format(len(train_gal)))
print("Average samples per galaxy: {}".format(s/len(train_gal)))

Total distinct galaxies: 181
Average samples per galaxy: 21.353591160220994


In [8]:
# Test set: number of distinct galaxies and mean number of rows per galaxy
test_gal = set(test["galaxy"])
s = sum(len(test.loc[test['galaxy'] == x]) for x in test_gal)  # total rows across all galaxies
print("Total distinct galaxies: {}".format(len(test_gal)))
print("Average samples per galaxy: {}".format(s/len(test_gal)))

Total distinct galaxies: 172
Average samples per galaxy: 5.174418604651163


In [9]:
# Show the (rows, columns) of both frames after the re-split above
print("Train Data shape :", train.shape)
print("Test Data shape :", test.shape)

Train Data shape : (3865, 54)
Test Data shape : (890, 53)


In [10]:
def cross_validation_loop(data, cor):
    """Cross-validate a gradient-boosting regressor on the given rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to evaluate on. Must contain the label column ``'y'`` and the
        ``'galaxy'`` column; both are removed from the feature matrix.
    cor : int
        Number of feature columns to keep, ranked by absolute correlation
        with ``'y'``.

    Returns
    -------
    float
        Mean of the 4-fold ``neg_root_mean_squared_error`` test scores
        (values closer to zero are better).
    """
    # Local import so this cell does not depend on an edit to the import cell.
    from sklearn.pipeline import make_pipeline

    labels = data['y']
    features = data.drop(['galaxy', 'y'], axis=1)

    # Keep the `cor` columns with the largest absolute correlation to the label.
    # NOTE(review): this ranking still uses every row passed in, including the
    # rows that later act as validation folds.
    correlation = features.corrwith(labels).abs()
    features = features[correlation.nlargest(cor).index]

    # The imputer and scaler are part of the pipeline so that cross_validate
    # refits them on each training fold only; fitting them on all rows first
    # would let validation-fold statistics leak into preprocessing.
    # The regressor is seeded so repeated runs give the same score.
    estimator = make_pipeline(
        SimpleImputer(missing_values=np.nan, strategy='mean'),
        StandardScaler(),
        GradientBoostingRegressor(n_estimators=300, random_state=0),
    )

    cv_results = cross_validate(estimator, features, labels, cv=4,
                                scoring='neg_root_mean_squared_error')

    return np.mean(cv_results['test_score'])

In [11]:
# Galaxies included in the cross-validation sweep; galaxy 126 is left out.
# NOTE(review): presumably 126 has too few rows for the 4-fold CV — confirm.
train_gal = set(train["galaxy"])
train_gal.remove(126)


def loop_train(cor):
    """Return the mean per-galaxy cross-validation score for `cor` features.

    Calls ``cross_validation_loop`` once for every galaxy in the module-level
    ``train_gal`` set, passing that galaxy's rows of ``train``, and averages
    the returned scores.
    """
    per_galaxy_scores = [
        cross_validation_loop(train.loc[train['galaxy'] == gal], cor)
        for gal in train_gal
    ]
    return np.mean(per_galaxy_scores)

In [14]:
# Try each candidate feature count and record the mean per-galaxy CV score for it
cor = [10, 15]
errors = [loop_train(x) for x in cor]

In [13]:
# Mean negative-RMSE scores, one per entry of `cor`, in the same order.
# NOTE(review): this cell's execution count (13) is lower than that of the
# sweep cell above it (14), so the saved output may come from an earlier run —
# re-run both cells in order to confirm.
print(errors)

[-0.006987424456564654, -0.007024686745992218]


In [15]:
def test_loop(data, test_data, n_features=20, random_state=0):
    """Fit a gradient-boosting regressor on `data` and predict `test_data`.

    Parameters
    ----------
    data : pandas.DataFrame
        Training rows. Must contain the label column ``'y'`` and the
        ``'galaxy'`` column; both are removed from the feature matrix.
    test_data : pandas.DataFrame
        Rows to predict. Must contain every feature column selected from
        ``data``.
    n_features : int, default 20
        Number of feature columns to keep, ranked by absolute correlation
        with ``'y'`` on the training rows.
    random_state : int or None, default 0
        Seed passed to the regressor so repeated runs give the same model.

    Returns
    -------
    tuple
        ``(model, predictions)``: the fitted ``GradientBoostingRegressor`` and
        the array returned by ``model.predict`` for ``test_data``.

    Raises
    ------
    ValueError
        If ``data`` has no rows.
    """
    if len(data) == 0:
        raise ValueError("test_loop needs at least one training row")

    train_labels = data['y']
    features = data.drop(['galaxy', 'y'], axis=1)

    # Rank features by absolute correlation with the label on the training rows
    # and keep the strongest `n_features`; the same columns are used for test.
    correlation = features.corrwith(train_labels).abs()
    columns = correlation.nlargest(n_features).index

    train_data = features[columns]
    test_data = test_data[columns]

    # Imputer and scaler are fit on the training rows only, then applied to both.
    imp = IterativeImputer(max_iter=10, random_state=0).fit(train_data)
    train_data = imp.transform(train_data)
    test_data = imp.transform(test_data)

    scaler = StandardScaler().fit(train_data)
    train_data = scaler.transform(train_data)
    test_data = scaler.transform(test_data)

    model = GradientBoostingRegressor(n_estimators=300, random_state=random_state)
    model.fit(train_data, train_labels)

    predictions = model.predict(test_data)
    return model, predictions

In [16]:
# Start again from the untouched test features and order the rows by galaxy,
# so each galaxy's rows form one contiguous run.
test = test_res.sort_values(by=['galaxy'])
# One prediction slot per test row, addressed by position in the sorted frame.
# The fill value is a float so the column has float dtype from the start:
# writing float predictions into an int column relies on a silent upcast that
# pandas has deprecated.
test_pred = pd.DataFrame(0.0, index=np.arange(len(test)), columns=["predicted_y"])

In [17]:
# Fit one model per galaxy and store its predictions in `test_pred`.
# `test` is sorted by galaxy and `test_pred` is filled by position, so the
# galaxies must be visited in ascending order; a set has no guaranteed
# iteration order, hence the explicit sorted().
i = 0  # position of the first unfilled row in test_pred
for gal in sorted(test_gal):
    gal_rows = test.loc[test['galaxy'] == gal]
    count = len(gal_rows)
    data = train.loc[train['galaxy'] == gal]
    # test_loop returns (model, predictions); only the predictions are stored.
    pred = test_loop(data, gal_rows)[1]
    test_pred.loc[i:i+count-1, 'predicted_y'] = pred
    i = i + count

In [18]:
# Attach the predictions (stored in galaxy-sorted row order) to the sorted
# frame, then put the rows back in index order.
test["predicted_y"] = test_pred["predicted_y"].to_numpy()
test = test.sort_index()
predictions = test["predicted_y"]

In [19]:
# Display the predicted values, indexed by test row label
predictions

0      0.043277
1      0.041137
2      0.034454
3      0.040795
4      0.026921
         ...   
885    0.035168
886    0.042361
887    0.075742
888    0.066175
889    0.041541
Name: predicted_y, Length: 890, dtype: float64

In [22]:
# Transform the predicted values into the quantity used to rank rows below.
# NOTE(review): the 0.01 offset and the +3 shift are taken as given — confirm their source.
index = predictions
pot_inc = 3 - np.log(index + 0.01)

In [23]:
# Square of pot_inc; used below to pick the top rows and to weight the allocation
p2= pot_inc**2

In [24]:
# Submission frame: one row per test row, with the allocation column `opt_pred`
# initialised to 0 and filled in by the next cell.
ss = pd.DataFrame({
    'Index':test.index,
    'pred': predictions,
    'opt_pred':0,
    'eei':test['existence expectancy index'], # So we can split into low and high EEI galaxies
})

In [25]:
# Give 100 to the 400 rows with the largest p2.
ss.loc[p2.nlargest(400).index, 'opt_pred'] = 100
# Rank rows by ascending prediction and give 50 to the rows at ranks 400-599.
ss = ss.sort_values('pred')
# Single positional assignment on `ss` itself: the chained form
# `ss.iloc[400:600].opt_pred = 50` writes to a temporary object under pandas
# copy-on-write and would leave `ss` unchanged.
ss.iloc[400:600, ss.columns.get_loc('opt_pred')] = 50
# Restore index order so `ss` lines up with `p2` again.
ss = ss.sort_index()

In [26]:
# Per-row product of the allocation and p2, divided by 1000
increase = (ss['opt_pred']*p2)/1000

In [27]:
# Report: total of `increase`, allocation given to rows with eei < 0.7, and total allocation.
# NOTE(review): `eei` is read from the frame loaded from 'scaled_test.csv'; if that
# column is standardized, the 0.7 threshold is not on the raw scale — confirm.
print(sum(increase), ss.loc[ss.eei < 0.7, 'opt_pred'].sum(), ss['opt_pred'].sum())

1789.3865314381956 42650 50000


In [28]:
# Write the three submission columns to submission.csv, without the DataFrame index
ss[['Index', 'pred', 'opt_pred']].to_csv('submission.csv', index=False)