<a href="https://colab.research.google.com/github/MathewLipman/Work-Samples/blob/main/JB_Data_Chain_Method.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting catboost
  Downloading catboost-1.2-cp310-cp310-manylinux2014_x86_64.whl (98.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.6/98.6 MB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2


In [3]:
import catboost as cb
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.style as style
import collections

In [4]:
def prep_for_ml(df):
  # remove pandas types
  return (df
    .assign(**{col:df[col].astype(float)
               for col in df.select_dtypes('number')},
            **{col:df[col].astype(str).fillna('')
              for col in df.select_dtypes(['object', 'category'])})
  )

In [5]:
def predict_col(df, col):
  df = prep_for_ml(df)
  missing = df.query(f'~{col}.isna()')
  cat_idx = [i for i,typ in enumerate(df.drop(columns=[col]).dtypes)
             if str(typ) == 'object']
  X = (missing
      .drop(columns=[col])
      .values
       )
  y = missing[col]
  model = cb.CatBoostRegressor(iterations=20, cat_features=cat_idx)
  model.fit(X,y, cat_features=cat_idx)
  pred = model.predict(df.drop(columns=[col]))
  return df[col].where(~df[col].isna(), pred)

In [8]:
def get_uniq_cols(jb):
    counter = collections.defaultdict(list)
    for col in sorted(jb.columns):
        period_count = col.count('.')
        if period_count >= 2:
            part_end = 2
        else:
            part_end = 1
        parts = col.split('.')[:part_end]
        counter['.'.join(parts)].append(col)
    uniq_cols = []
    for cols in counter.values():
        if len(cols) == 1:
          uniq_cols.extend(cols)
    return uniq_cols

In [9]:
def tweak_jb(jb):
  uniq_cols = get_uniq_cols(jb)
  return (jb
    [uniq_cols]
    .rename(columns=lambda c: c.replace('.','_'))
    .assign(age = lambda df_: df_.age
        .str.slice(0,2)
        .astype('float')
        .astype('Int64'),
            are_you_datascientist=lambda df_: df_.are_you_datascientist
            .replace({'Yes':True, 'No':False, np.nan:False}),
            company_size=lambda df_:df_.company_size
            .replace({'Just me': 1, 'Not sure': np.nan, 'More than 5,000': 5000,
                      '2–10': 2, '11–50':11,'51–500': 51,
                      '501–1,000':501, '1,001–5,000':1001})
            .astype('Int64').astype,
            country_live =lambda df_: df_.country_live.astype('category'),
            employment_status=lambda df_: df_.employment_status.fillna('Other')
            .astype('category'),
            is_python_main=lambda df_: df_.is_python_main.astype('category'),
            team_size=lambda df_: df_.team_size.str.split('-', n=0)
            .str[0].replace('More than 40 people', 41)
            .where(df_.company_size!=1, 1).astype(float),
            years_of_coding = lambda df_: df_.years_of_coding
            .replace('Less than 1 year', .5).str.extract(r'(\d+)')
            .astype('float'),
            python_years=lambda df_: df_.python_years.replace('Less than 1 year',.5)
            .str.extract(r'(\d+)').astype('float'),
            use_python_most=lambda df_: df_.use_python_most.fillna('Unknown')
            )
      .assign(team_size=lambda df_:predict_col(df_, 'team_size')
              .astype(int))
      .drop(columns=['python2_version_most'])
      .dropna()
    )

url = 'https://github.com/mattharrison/datasets/raw/master/data/2020-jetbrains-python-survey.csv'
jb = pd.read_csv(url)
jb2 = tweak_jb(jb)

  jb = pd.read_csv(url)


Learning rate set to 0.5
0:	learn: 6.3803115	total: 73.3ms	remaining: 1.39s
1:	learn: 6.3357433	total: 91.8ms	remaining: 826ms
2:	learn: 6.2698171	total: 106ms	remaining: 600ms
3:	learn: 6.2389428	total: 119ms	remaining: 476ms
4:	learn: 6.2255307	total: 131ms	remaining: 393ms
5:	learn: 6.2151646	total: 143ms	remaining: 333ms
6:	learn: 6.1989072	total: 157ms	remaining: 291ms
7:	learn: 6.1814612	total: 167ms	remaining: 251ms
8:	learn: 6.1779375	total: 178ms	remaining: 218ms
9:	learn: 6.1707748	total: 189ms	remaining: 189ms
10:	learn: 6.1702599	total: 201ms	remaining: 164ms
11:	learn: 6.1648639	total: 211ms	remaining: 141ms
12:	learn: 6.1534347	total: 222ms	remaining: 120ms
13:	learn: 6.1495147	total: 231ms	remaining: 99ms
14:	learn: 6.1364829	total: 243ms	remaining: 81ms
15:	learn: 6.1339867	total: 253ms	remaining: 63.3ms
16:	learn: 6.1230472	total: 265ms	remaining: 46.9ms
17:	learn: 6.1112328	total: 283ms	remaining: 31.5ms
18:	learn: 6.1102879	total: 295ms	remaining: 15.5ms
19:	learn: 6

Unnamed: 0,age,nps_main_ide,python_years,team_size,years_of_coding
count,16094.0,16094.0,16094.0,16094.0,16094.0
mean,29.253635,8.862682,4.61613,4.156953,5.097552
std,9.664401,1.535643,3.185911,4.611627,3.809978
min,18.0,0.0,1.0,1.0,1.0
25%,21.0,8.0,3.0,2.0,1.0
50%,30.0,9.0,3.0,3.0,3.0
75%,30.0,10.0,6.0,4.0,11.0
max,60.0,10.0,11.0,41.0,11.0


In [14]:
jb.memory_usage(deep=True).sum()

529602448

from 529 megs to 13 megs

In [15]:
jb2.memory_usage(deep=True).sum()

13892498