In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("train.csv", index_col=[0])

In [3]:
df.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


In [63]:
df["Weight"] = np.random.rand(len(df), 1)

df["bool1"] = np.random.choice([False, True], size=len(df))
df["bool2"] = np.random.choice([False, True], size=len(df))

In [5]:
def weighted_avg(x, wts):
    return np.average(x, weights=wts, axis=0)

def weighted_std(x, wts):
    average = np.average(x, weights=wts, axis=0)
    xiance = np.average((x - average) ** 2, weights=wts, axis=0)
    return np.sqrt(xiance)

def weighted_skew(x, wts):
    return (np.average((x - weighted_avg(x, wts))**3, weights=wts, axis=0) /
            weighted_avg(x, wts)**(1.5))

def weighted_kurtosis(x, wts):
    return (np.average((x - weighted_avg(x, wts))**4, weights=wts, axis=0) /
            weighted_avg(x, wts)**(2))

In [6]:
funcs = [weighted_avg, weighted_std, weighted_skew, weighted_kurtosis]
metrics = ["LotArea", "LotFrontage", "SalePrice"]

In [7]:
df.dropna(subset=metrics, inplace=True)

In [8]:
def weighted_functions(df, funcs, metrics):
    names = [func.__name__ for func in funcs]
    groups = []
    
    for func in funcs:
        gb = df.groupby(["MSZoning", "bool"]).apply(lambda x: pd.Series(func(x[metrics], x["Weight"]), metrics)).T
        groups.append(gb)
        
    gb = df.groupby(["MSZoning", "SaleCondition"]).apply(lambda x: pd.Series(np.median(x[metrics]), metrics)).T
    groups.append(gb)
    names.append("median")
    
    return pd.concat(groups, axis=1, keys=names).T

In [9]:
a = df.loc[(df["bool"] == True) & (df["MSZoning"] == "RM"), ["SalePrice", "Weight"]]

In [10]:
np.average(a["SalePrice"], weights=a["Weight"])

128736.94712885302

In [11]:
res = weighted_functions(df, funcs, metrics=metrics)
res.loc["weighted_avg"]

Unnamed: 0_level_0,Unnamed: 1_level_0,LotArea,LotFrontage,SalePrice
MSZoning,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
C (all),False,8100.919054,56.880884,46318.651261
C (all),True,9883.50953,77.055986,89484.779008
FV,False,6592.69269,56.633744,206718.193385
FV,True,6626.162145,57.42338,222309.014313
RH,False,6253.804401,60.150555,131622.975111
RH,True,8053.481004,59.447672,117326.624385
RL,False,10793.248264,74.55787,196645.850991
RL,True,10689.474162,74.378966,189496.51027
RM,False,6508.120672,51.118192,121747.32464
RM,True,6900.161137,54.35018,128736.947129


In [27]:
pd.DataFrame(df.loc[df["LotShape"] == "Reg"].groupby(["MSZoning"])["SalePrice"].sum() / df.loc[df["LotShape"] == "Reg", "SalePrice"].sum() * 100)

Unnamed: 0_level_0,SalePrice
MSZoning,Unnamed: 1_level_1
C (all),0.474713
FV,7.112992
RH,1.001867
RL,75.563992
RM,15.846436


In [36]:
df.loc[df["bool"] == True].groupby(["MSZoning"])["SalePrice"].sum() / df.loc[df["bool"] == True]["SalePrice"].sum() * 100

MSZoning
C (all)     0.546263
FV          5.521683
RH          0.572074
RL         83.646452
RM          9.713528
Name: SalePrice, dtype: float64

In [120]:
overall = df.groupby(["bool1"]).apply(lambda x: np.average(x["SalePrice"], weights=x["Weight"]))

f = pd.DataFrame(columns=[False, True])
for zone in df["MSZoning"].unique()[:1]:
    tt = df.loc[~(df["MSZoning"] == zone)].groupby(["bool1"]).apply(lambda x: np.average(x["SalePrice"], weights=x["Weight"]))
    # f.loc[zone] = ((tt - overall) / overall * 100).values
    f.loc[zone] = ((overall - tt) / overall * 100).values

In [124]:
f

# RL makes up 22.65 % of performance

Unnamed: 0,False,True
RL,22.654443,23.178531
