In [60]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [86]:
df = pd.read_csv('train.csv')

In [87]:
test = pd.read_csv('test.csv')

In [88]:
df.shape

(30000, 30)

In [89]:
test.shape

(18000, 29)

In [90]:
df.head(5)

Unnamed: 0,apartment_id,building_age_years,wing,entrance,floor,apartment_area_m2,ceiling_height_m,window_quality,heating_type,corner_apartment,...,stand_type,cut_days_before_jan1,potted_tree,waterings_per_week,mist_spray,tinsel_level,ornaments_weight_kg,led_garland,garland_hours_per_day,survived_to_18jan
0,apt_train_000001,27,east,6,16,58.7,2.73,new,central,1,...,bucket,10,0,1.0,1,high,4.53,0,11.3,1
1,apt_train_000002,27,west,9,20,61.9,2.72,normal,electric_heater,1,...,bucket,12,0,1.0,1,low,,0,2.9,0
2,apt_train_000003,27,north,2,9,48.0,2.48,normal,central,0,...,simple_stand,0,0,2.0,0,medium,3.17,1,15.2,0
3,apt_train_000004,27,west,11,14,32.1,2.54,normal,central,1,...,water_reservoir,22,0,6.0,1,medium,2.9,0,4.6,0
4,apt_train_000005,27,south,3,20,57.9,2.73,old,electric_heater,1,...,water_reservoir,2,0,4.0,1,medium,2.38,1,0.0,1


In [91]:
df.dtypes

Unnamed: 0,0
apartment_id,object
building_age_years,int64
wing,object
entrance,int64
floor,int64
apartment_area_m2,float64
ceiling_height_m,float64
window_quality,object
heating_type,object
corner_apartment,int64


In [92]:
cat_cols = ['apartment_id', 'wing', 'window_quality', 'heating_type', 'tree_species', 'tree_form', 'stand_type', 'tinsel_level']

In [93]:
df.drop(columns=cat_cols, inplace=True)
df.dtypes

Unnamed: 0,0
building_age_years,int64
entrance,int64
floor,int64
apartment_area_m2,float64
ceiling_height_m,float64
corner_apartment,int64
room_temp_c,float64
window_ventilation_per_day,int64
humidity_pct,float64
radiator_distance_m,float64


In [94]:
test.drop(columns=cat_cols, inplace=True)

In [95]:
df.isna().sum()

Unnamed: 0,0
building_age_years,0
entrance,0
floor,0
apartment_area_m2,0
ceiling_height_m,892
corner_apartment,0
room_temp_c,0
window_ventilation_per_day,0
humidity_pct,2176
radiator_distance_m,1225


In [97]:
for c in df.columns:
  if df[c].isna().sum():
    df[c][df[c].isna()] = df[c].mean()

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df[c][df[c].isna()] = df[c].mean()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[c][df[c].isna()] = df[c].

In [99]:
df.dropna(inplace=True)
df.shape

(30000, 22)

In [100]:
for c in test.columns:
  if test[c].isna().sum():
    test[c][test[c].isna()] = test[c].mean()

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  test[c][test[c].isna()] = test[c].mean()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test[c][test[c].isna()

In [101]:
test.dropna(inplace=True)
test.shape

(18000, 21)

In [102]:
X = df.drop(columns=['survived_to_18jan', 'building_age_years'])
y = df['survived_to_18jan']

In [103]:
test = test.drop(columns=['building_age_years'])

In [104]:
X = X.astype("float64")
test = test.astype("float64")

In [105]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [106]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [107]:
model = LogisticRegression(solver="liblinear")

In [108]:
model.fit(X_train_scaled, y_train)

In [109]:
y_proba = model.predict_proba(X_test_scaled)[:, 1]

In [110]:
roc_auc = roc_auc_score(y_test, y_proba)
roc_auc

np.float64(0.6583404826106893)

In [111]:
test_scaled = scaler.transform(test)

In [112]:
submission = pd.read_csv("/content/sample_submission.csv")

In [113]:
submission

Unnamed: 0,apartment_id,survived_to_18jan
0,apt_000001,0.5
1,apt_000002,0.5
2,apt_000003,0.5
3,apt_000004,0.5
4,apt_000005,0.5
...,...,...
17995,apt_017996,0.5
17996,apt_017997,0.5
17997,apt_017998,0.5
17998,apt_017999,0.5


In [114]:
y_submission = model.predict_proba(test_scaled)[:, 1]

In [122]:
submission['survived_to_18jan'] = y_submission

In [123]:
submission

Unnamed: 0,apartment_id,survived_to_18jan
0,apt_000001,0.419434
1,apt_000002,0.380126
2,apt_000003,0.425557
3,apt_000004,0.298942
4,apt_000005,0.350037
...,...,...
17995,apt_017996,0.594237
17996,apt_017997,0.295293
17997,apt_017998,0.435973
17998,apt_017999,0.642827


In [121]:
submission.to_csv("/content/baseline_submission.csv", index=False)