In [24]:
import pandas as pd
import numpy as np

from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate
from scipy.optimize import minimize

import warnings 
warnings.filterwarnings('ignore')

In [25]:
pd.set_option('display.max_columns',None)

# training data
train = pd.read_csv('train.csv')

# test data
test = pd.read_csv('test.csv')
df=pd.concat([train,test], sort=False)
train.head()

Unnamed: 0,galactic year,galaxy,existence expectancy index,existence expectancy at birth,Gross income per capita,Income Index,Expected years of education (galactic years),Mean years of education (galactic years),Intergalactic Development Index (IDI),Education Index,"Intergalactic Development Index (IDI), Rank",Population using at least basic drinking-water services (%),Population using at least basic sanitation services (%),Gross capital formation (% of GGP),"Population, total (millions)","Population, urban (%)","Mortality rate, under-five (per 1,000 live births)","Mortality rate, infant (per 1,000 live births)",Old age dependency ratio (old age (65 and older) per 100 creatures (ages 15-64)),"Population, ages 15–64 (millions)","Population, ages 65 and older (millions)","Life expectancy at birth, male (galactic years)","Life expectancy at birth, female (galactic years)","Population, under age 5 (millions)",Young age (0-14) dependency ratio (per 100 creatures ages 15-64),"Adolescent birth rate (births per 1,000 female creatures ages 15-19)",Total unemployment rate (female to male ratio),Vulnerable employment (% of total employment),"Unemployment, total (% of labour force)",Employment in agriculture (% of total employment),Labour force participation rate (% ages 15 and older),"Labour force participation rate (% ages 15 and older), female",Employment in services (% of total employment),"Labour force participation rate (% ages 15 and older), male",Employment to population ratio (% ages 15 and older),Jungle area (% of total land area),"Share of employment in nonagriculture, female (% of total employment in nonagriculture)",Youth unemployment rate (female to male ratio),"Unemployment, youth (% ages 15–24)","Mortality rate, female grown up (per 1,000 people)","Mortality rate, male grown up (per 1,000 people)","Infants lacking immunization, red hot disease (% of one-galactic year-olds)","Infants lacking immunization, Combination Vaccine (% of one-galactic year-olds)",Gross galactic product (GGP) per capita,"Gross galactic product (GGP), total","Outer Galaxies direct investment, net inflows (% of GGP)",Exports and imports (% of GGP),Share of seats in senate (% held by female),Natural resource depletion,"Mean years of education, female (galactic years)","Mean years of education, male (galactic years)","Expected years of education, female (galactic years)","Expected years of education, male (galactic years)","Maternal mortality ratio (deaths per 100,000 live births)",Renewable energy consumption (% of total final energy consumption),"Estimated gross galactic income per capita, male","Estimated gross galactic income per capita, female",Rural population with access to electricity (%),Domestic credit provided by financial sector (% of GGP),"Population with at least some secondary education, female (% ages 25 and older)","Population with at least some secondary education, male (% ages 25 and older)",Gross fixed capital formation (% of GGP),"Remittances, inflows (% of GGP)",Population with at least some secondary education (% ages 25 and older),Intergalactic inbound tourists (thousands),"Gross enrolment ratio, primary (% of primary under-age population)","Respiratory disease incidence (per 100,000 people)",Interstellar phone subscriptions (per 100 people),"Interstellar Data Net users, total (% of population)",Current health expenditure (% of GGP),"Intergalactic Development Index (IDI), female","Intergalactic Development Index (IDI), male",Gender Development Index (GDI),"Intergalactic Development Index (IDI), female, Rank","Intergalactic Development Index (IDI), male, Rank",Adjusted net savings,"Creature Immunodeficiency Disease prevalence, adult (% ages 15-49), total",Private galaxy capital flows (% of GGP),Gender Inequality Index (GII),y
0,990025,Large Magellanic Cloud (LMC),0.628657,63.1252,27109.23431,0.646039,8.240543,,,,,,,38.305483,681.841086,22.131436,150.25929,129.841789,12.403812,490.4464,60.12158,57.028183,69.825369,46.01823,98.216072,174.617899,,,,,79.614066,73.0716,,85.851544,,73.166354,,,,431.270303,466.089287,16.167871,8.223563,21158.324865,7760.639489,13.617993,94.859306,,8.436392,,,,,1237.228737,118.434634,,,,75.604799,,,42.616284,,,,,,,,,,,,,,,,,,0.05259
1,990025,Camelopardalis B,0.818082,81.004994,30166.793958,0.852246,10.671823,4.74247,0.833624,0.467873,152.522198,,,36.255559,423.973089,89.294181,117.069332,61.82259,13.054898,560.547706,58.573002,69.873905,76.752305,40.16796,88.322574,109.892385,,,,,63.81372,45.482071,,93.526359,,35.590371,,,,221.411013,376.475151,36.639519,21.007339,24984.044391,6969.137436,19.48483,77.935108,,12.549351,7.09607,8.098794,11.797213,13.477707,544.458925,53.753525,,,,57.21415,57.314932,56.187355,29.908422,6.225946,44.780023,,120.88608,,,,,,,,,,19.177926,,22.785018,,0.059868
2,990025,Virgo I,0.659443,59.570534,8441.707353,0.499762,8.840316,5.583973,0.46911,0.363837,209.813266,,,18.093991,285.572207,57.491083,226.52376,134.558437,15.963001,555.012461,43.078497,59.462626,59.152506,91.093668,114.163995,230.573666,,,,,80.874149,75.217727,,95.041272,,48.892555,,,,513.730396,581.063598,31.958515,18.954434,19860.12152,3276.634176,13.357158,87.50862,,13.949733,6.494764,6.730145,6.960041,6.805405,1537.953483,126.063691,,,,76.141735,42.405827,53.927715,18.732049,4.138115,24.030945,,96.626831,,,,,,,,,,21.151265,6.53402,,,0.050449
3,990025,UGC 8651 (DDO 181),0.555862,52.333293,,,,,,,,,,,327.202247,47.785456,279.000175,193.228799,12.940991,766.807703,51.318372,50.535797,54.931369,92.727892,97.645232,198.971328,,,,,84.721882,82.026125,,86.307832,,,,,,497.785386,612.106994,,,,,,,,,,,,,2016.655551,,,,,,,,,,,,,,,,,,,,,,,5.912194,,,0.049394
4,990025,Tucana Dwarf,0.991196,81.802464,81033.956906,1.131163,13.800672,13.188907,0.910341,0.918353,71.885345,,,34.497468,633.799718,103.562629,28.359587,51.09825,25.521926,585.434804,56.158054,85.020965,84.708158,71.49381,53.835875,61.008654,,,,,61.086141,66.286848,,76.671844,,,,,,150.173225,213.691418,29.577375,11.134867,75156.049406,8957.403969,,241.901357,,15.09082,11.830216,12.122982,,,232.57314,39.192277,,,134.967049,,77.223935,75.475076,31.398393,,66.674651,,,,,,,,,,,,,5.611753,,,0.154247


In [26]:
df["galaxy"] = df["galaxy"].astype('category')
df["galaxy"] = df["galaxy"].cat.codes
train = df[:3865]
test = df[3865:]
test=test.drop("y", axis = 1)
test_res= test.copy()

In [27]:
train_gal=set(train["galaxy"])
s=0
for x in train_gal:
    s=s+len(train.loc[train['galaxy'] == x])
print("Total distinct galaxies: {}".format(len(train_gal)))
print("Average samples per galaxy: {}".format(s/len(train_gal)))

Total distinct galaxies: 181
Average samples per galaxy: 21.353591160220994


In [28]:
test_gal=set(test["galaxy"])
s=0
for x in test_gal:
    s=s+len(test.loc[test['galaxy'] == x])
print("Total distinct galaxies: {}".format(len(test_gal)))
print("Average samples per galaxy: {}".format(s/len(test_gal)))

Total distinct galaxies: 172
Average samples per galaxy: 5.174418604651163


In [29]:

print("Train vector: " + str(train.shape))
print("Test vector: " + str(test.shape))

Train vector: (3865, 80)
Test vector: (890, 79)


In [30]:
def cross_validation_loop(data,cor):
    labels= data['y']
    data=data.drop('galaxy', axis=1)    
    data=data.drop('y', axis=1)
    
    correlation=abs(data.corrwith(labels))
    columns=correlation.nlargest(cor).index
    data=data[columns]
    
    imp = SimpleImputer(missing_values=np.nan, strategy='mean').fit(data)
    data=imp.transform(data)

    scaler = StandardScaler().fit(data)
    data = scaler.transform(data)
        
    estimator = GradientBoostingRegressor(n_estimators=300)
    
    cv_results = cross_validate(estimator, data, labels, cv=4, scoring='neg_root_mean_squared_error')

    error=np.mean(cv_results['test_score'])
    
    return error

In [31]:
train_gal=set(train["galaxy"])
train_gal.remove(126)
def loop_train(cor):
    errors=[]
    for gal in train_gal:
        index = train.index[train['galaxy'] == gal]
        data = train.loc[index]
        errors.append(cross_validation_loop(data,cor))
    return np.mean(errors)

In [32]:
cor=[20,25,30,40,50,60,70,80]
errors=[]
for x in cor:
    errors.append(loop_train(x))

KeyboardInterrupt: 

In [23]:
print(errors)

[-0.006349489123870391, -0.006445235762609366, -0.006465493077998745, -0.006611827434541564, -0.006762339946789965, -0.006923052053205841, -0.0070710764633024415, -0.007027285851313923]


In [11]:
def test_loop(data, test_data):
    labels= data['y']
    data=data.drop('galaxy', axis=1)    
    data=data.drop('y', axis=1)
    correlation=abs(data.corrwith(labels))
    columns=correlation.nlargest(20).index
    
    train_labels= labels
    train_data=data[columns]
    test_data= test_data[columns]
    
    imp = SimpleImputer(missing_values=np.nan, strategy='mean').fit(train_data)
    train_data=imp.transform(train_data)
    test_data=imp.transform(test_data)

    scaler = StandardScaler().fit(train_data)
    train_data = scaler.transform(train_data)
    test_data = scaler.transform(test_data)

    model = GradientBoostingRegressor(n_estimators=300)
    model.fit(train_data, train_labels)

    predictions = model.predict(test_data)
    return predictions

In [12]:
test=test_res
test=test.sort_values(by=['galaxy'])
test_pred = pd.DataFrame(0, index=np.arange(len(test)), columns=["predicted_y"])

In [13]:
i=0
for gal in test_gal:
    count=len(test.loc[test['galaxy'] == gal])
    index = train.index[train['galaxy'] == gal]
    data = train.loc[index]
    pred=test_loop(data,test.loc[test['galaxy']==gal])
    test_pred.loc[i:i+count-1,'predicted_y'] = pred
    i=i+count

In [14]:
test["predicted_y"]=test_pred.to_numpy()
test.sort_index(inplace=True)
predictions = test["predicted_y"]

In [15]:
index = predictions
pot_inc = -np.log(index+0.01)+3

In [16]:
p2= pot_inc**2

In [17]:
ss = pd.DataFrame({
    'Index':test.index,
    'pred': predictions,
    'opt_pred':0,
    'eei':test['existence expectancy index'], # So we can split into low and high EEI galaxies
})

In [18]:
ss.loc[p2.nlargest(400).index, 'opt_pred']=100
ss=ss.sort_values('pred')
ss.iloc[400:600].opt_pred = 50
ss=ss.sort_index()

In [19]:
increase = (ss['opt_pred']*p2)/1000

In [20]:
print(sum(increase), ss.loc[ss.eei < 0.7, 'opt_pred'].sum(), ss['opt_pred'].sum())

1789.7121397915878 6500 50000


In [21]:

ss[['Index', 'pred', 'opt_pred']].to_csv('submission.csv', index=False)