In [1]:
import pandas as pd
import math as m

drop_cols = ['month', 'year', 'app_bundle', 'creative_size', 'day_of_week']

# Collect every piece in a list and concatenate once after the loop; calling
# pd.concat inside the loop re-copies the ever-growing frame on each day.
# The '00' file is kept whole: unlike the daily files below it is neither
# split nor downsampled.  NOTE(review): confirm that is intended.
frames = [
    pd.read_csv('csvs/2019-04-00.csv', low_memory=False).drop(drop_cols, axis=1)
]

for i in range(1, 22):
    # Read daily csv (days 01..21, zero-padded to two digits)
    temp = pd.read_csv('csvs/2019-04-{:02d}.csv'.format(i), low_memory=False)

    # Drop unnecessary columns
    temp = temp.drop(drop_cols, axis=1)

    # Split daily csv into two dataframes: click and no click
    # Note: observations for which clicks = 0 & installs = 1 should actually
    # read clicks = 1 & installs = 1
    # The explicit .copy() makes `click` an independent frame, so the
    # assignment below no longer triggers SettingWithCopyWarning.
    click = temp.loc[(temp.clicks == 1) | (temp.installs == 1)].copy()
    click.loc[:, 'clicks'] = 1.0
    no_click = temp.loc[(temp.clicks == 0) & (temp.installs == 0)]

    # Downsample no click to be the same size as click
    # (capped at the number of available no-click rows so that sampling
    # without replacement cannot raise on a click-heavy day).
    # For reproducibility, set random_state to 0
    n_keep = min(len(click), len(no_click))
    frames.append(click)
    frames.append(no_click.sample(n=n_keep, random_state=0))

    print(i)

# Same row order as before: seed frame, then (click, sampled no-click) per day.
df = pd.concat(frames, axis=0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21


In [2]:
# Distinct `category` values of the training frame; a value's position in
# this list is what the encoding cells later use as its integer code.
L_category = list(df['category'].unique())

In [3]:
# Distinct `inventory_source` values seen in training; list position is the
# integer code used when the feature matrices are encoded.
L_inv_source = list(df['inventory_source'].unique())
L_inv_source

['MOPUB', 'GOOGLE_ADX', 'PUBMATIC', 'RUBICON']

In [4]:
# Distinct `platform_carrier` values seen in training (missing values show up
# as a single nan entry); list position is the integer code used later.
L_platform = list(df['platform_carrier'].unique())
L_platform

['Sprint',
 'T-Mobile',
 nan,
 'AT&T',
 'Verizon',
 'C-Spire Wireless',
 'U.S. Cellular',
 'Cellular One',
 'Viaero',
 'West Central',
 '-1',
 'i wireless',
 'Bluegrass Cellular',
 'Appalachian Wireless',
 'Pioneer Cellular',
 'Cincinnati Bell',
 'ETEX Wireless']

In [5]:
# Distinct `platform_device_screen_size` values seen in training; list
# position is the integer code used later.
L_screen_size = list(df['platform_device_screen_size'].unique())
L_screen_size

['XL', 'L', nan, 'UNKNOWN', 'M']

In [6]:
# Training feature matrix: four categorical columns (still raw values at
# this point) followed by three numeric columns, as an object ndarray.
X = df.loc[:, [
    'inventory_source',
    'platform_carrier',
    'category',
    'platform_device_screen_size',
    'bid_floor',
    'rewarded',
    'inventory_interstitial',
]].values
X

array([['MOPUB', 'Sprint', 'IAB14,IAB9,IAB9_30', ..., 0.00337, 1.0, 1.0],
       ['MOPUB', 'T-Mobile', 'IAB14,IAB9,IAB9_30', ..., 0.00337, 1.0,
        1.0],
       ['MOPUB', 'T-Mobile', 'IAB1,IAB9,IAB9_30', ...,
        0.006540000000000001, 0.0, 1.0],
       ...,
       ['MOPUB', 'T-Mobile', 'IAB13,IAB3', ..., 0.00379, 0.0, 1.0],
       ['MOPUB', 'T-Mobile', 'IAB9,IAB9_30', ..., 0.008239999999999999,
        0.0, 1.0],
       ['MOPUB', nan, 'IAB9,IAB9_30', ..., 0.008409999999999999, 0.0,
        1.0]], dtype=object)

In [7]:
# Training target: the `installs` column as a plain array.
y = df.loc[:, 'installs'].values
y

array([0., 0., 0., ..., 0., 0., 0.])

In [8]:
# Replace the four categorical columns of X, in place, by the position of
# each value in its vocabulary list.  Column order matches the feature list
# used to build X.  list.index raises ValueError for a value that is not in
# the list, so this cell cannot be re-run on an already-encoded X.
vocabularies = (L_inv_source, L_platform, L_category, L_screen_size)

for row in X:  # each `row` is a view, so the assignment writes into X
    for col, levels in enumerate(vocabularies):
        row[col] = levels.index(row[col])

X

array([[0, 0, 0, ..., 0.00337, 1.0, 1.0],
       [0, 1, 0, ..., 0.00337, 1.0, 1.0],
       [0, 1, 1, ..., 0.006540000000000001, 0.0, 1.0],
       ...,
       [0, 1, 30, ..., 0.00379, 0.0, 1.0],
       [0, 1, 2, ..., 0.008239999999999999, 0.0, 1.0],
       [0, 2, 2, ..., 0.008409999999999999, 0.0, 1.0]], dtype=object)

In [9]:
# Sanity check: (number of training rows, number of feature columns).
X.shape

(2397462, 7)

In [10]:
# Day 22 is outside the range(1, 22) training days and is loaded as-is
# (no column dropping, no click/no-click downsampling) for evaluation.
test = pd.read_csv(
    "csvs/2019-04-22.csv",
    low_memory=False,
)

In [11]:
# Test feature matrix with the same columns, in the same order, as X.
X_test = test.loc[:, [
    'inventory_source',
    'platform_carrier',
    'category',
    'platform_device_screen_size',
    'bid_floor',
    'rewarded',
    'inventory_interstitial',
]].values
X_test

array([['MOPUB', 'AT&T', 'IAB1,IAB9,IAB9_23', ...,
        0.00033999999999999997, 0.0, 1.0],
       ['MOPUB', nan, 'IAB1,IAB9,IAB9_23', ..., 0.00253, 0.0, 1.0],
       ['MOPUB', 'Verizon', 'IAB1,IAB9,IAB9_23', ..., 0.00253, 0.0, 1.0],
       ...,
       ['MOPUB', nan, 'IAB1,IAB1_6', ..., 7e-05, 0.0, 0.0],
       ['MOPUB', 'T-Mobile', 'IAB9,IAB9_23,IAB9_30', ...,
        0.0039299999999999995, 0.0, 0.0],
       ['MOPUB', 'T-Mobile', 'IAB17,IAB9,IAB9_30', ..., 0.00079, 0.0,
        0.0]], dtype=object)

In [12]:
# Test target: the `installs` column of the held-out day.
y_test = test.loc[:, 'installs'].values
y_test

array([0., 0., 0., ..., 0., 0., 0.])

In [13]:
def _code_of(value, levels, unseen=-1):
    """Return the position of `value` in `levels`, or `unseen` if absent.

    Uses list.index, i.e. exactly the lookup applied to the training matrix,
    so values seen in training receive the same integer code here.
    """
    try:
        return levels.index(value)
    except ValueError:
        return unseen


# Encode the four categorical columns of X_test in place with the
# vocabularies learned from the training frame.  Every column -- including
# inventory_source, which previously raised ValueError on an unseen value
# after part of the array had already been rewritten -- maps unseen values
# to -1.
# Run once per freshly built X_test: values that are already integer codes
# are not in the vocabularies and would be recoded as -1.
for row in X_test:  # each `row` is a view, so the assignment writes into X_test
    row[0] = _code_of(row[0], L_inv_source)
    row[1] = _code_of(row[1], L_platform)
    row[2] = _code_of(row[2], L_category)
    row[3] = _code_of(row[3], L_screen_size)

X_test

array([[0, 3, 43, ..., 0.00033999999999999997, 0.0, 1.0],
       [0, 2, 43, ..., 0.00253, 0.0, 1.0],
       [0, 4, 43, ..., 0.00253, 0.0, 1.0],
       ...,
       [0, 2, 35, ..., 7e-05, 0.0, 0.0],
       [0, 1, 27, ..., 0.0039299999999999995, 0.0, 0.0],
       [0, 1, 39, ..., 0.00079, 0.0, 0.0]], dtype=object)

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score, balanced_accuracy_score

# Logistic-regression baseline fitted on the encoded training matrix.
# NOTE(review): class_weight gives the negative class the larger weight
# (1.2 vs 1); the recorded report shows no positive predictions at all --
# confirm this weighting is what was intended.
LR = LogisticRegression(
    solver='lbfgs',
    class_weight={0: 1.2, 1: 1},
    C=2,
    max_iter=1000,
).fit(X, y)

y_pred = LR.predict(X_test)
print(classification_report(y_test, y_pred))

# (F1 on test, balanced accuracy on test, accuracy on the training data)
(
    f1_score(y_test, y_pred),
    balanced_accuracy_score(y_test, y_pred),
    LR.score(X, y),
)

  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00    815378
         1.0       0.00      0.00      0.00        44

   micro avg       1.00      1.00      1.00    815422
   macro avg       0.50      0.50      0.50    815422
weighted avg       1.00      1.00      1.00    815422



  'precision', 'predicted', average, warn_for)


(0.0, 0.5, 0.9993213656775374)

In [15]:
from sklearn.ensemble import BaggingClassifier

# Bagging ensemble with library-default settings; only the seed is fixed.
bc = BaggingClassifier(random_state=10000).fit(X, y)

y2 = bc.predict(X_test)
print(classification_report(y_test, y2))

# (F1 on test, balanced accuracy on test, accuracy on the training data)
(
    f1_score(y_test, y2),
    balanced_accuracy_score(y_test, y2),
    bc.score(X, y),
)

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00    815378
         1.0       0.00      0.00      0.00        44

   micro avg       1.00      1.00      1.00    815422
   macro avg       0.50      0.50      0.50    815422
weighted avg       1.00      1.00      1.00    815422



(0.0, 0.4999993867874777, 0.9993230341085698)

In [16]:
from sklearn import ensemble

# Gradient boosting with a small learning rate and a fixed seed; all other
# hyper-parameters are left at the library defaults.
GBclf = ensemble.GradientBoostingClassifier(
    learning_rate=0.001,
    random_state=10000,
).fit(X, y)

y3 = GBclf.predict(X_test)
print(classification_report(y_test, y3))

# (F1 on test, balanced accuracy on test, accuracy on the training data)
(
    f1_score(y_test, y3),
    balanced_accuracy_score(y_test, y3),
    GBclf.score(X, y),
)

  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00    815378
         1.0       0.00      0.00      0.00        44

   micro avg       1.00      1.00      1.00    815422
   macro avg       0.50      0.50      0.50    815422
weighted avg       1.00      1.00      1.00    815422



  'precision', 'predicted', average, warn_for)


(0.0, 0.5, 0.9993213656775374)