In [37]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn import utils
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier,DecisionTreeRegressor
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingRegressor
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import BayesianRidge,SGDClassifier,Ridge,LinearRegression
from sklearn.kernel_ridge import KernelRidge
from sklearn import svm
from sklearn.svm import SVC,SVR
from keras.models import Sequential
from keras.layers import Dense, Dropout,Activation, Flatten
from keras.optimizers import Adam
from keras.utils import to_categorical
from math import sqrt
from imblearn.over_sampling import SMOTE
from sklearn.metrics import mean_squared_error

import time

In [38]:
def read_data(path='sample_users_100k.csv'):
    df = pd.read_csv(path,sep='\t')
    df['botscore']=pd.to_numeric(df['botscore'],errors='coerce')
    return df

In [39]:
def select_fearture(df1):
    df = df1[pd.notna(df1['botscore'])]
    df = df[df['botscore'] >= 0]
    df.loc[df.botscore > 0.5, 'isbot'] = 1
    df.loc[df.botscore <= 0.5, 'isbot'] = 0
    feature = df[['friendsCount','followersCount','statusesCount','tweetsCount','retweetsCount','listedCount','favoritesCount','influence_percentile','isbot']]
    
    df['psi'] = df['followersCount']/(df['friendsCount']+1)
    df['tr'] = df['tweetsCount']/(df['retweetsCount']+1)
    feature = df[['friendsCount','followersCount','statusesCount','tweetsCount','retweetsCount','listedCount','favoritesCount','influence_percentile','psi','tr','isbot']]
    return df, feature

In [40]:
def select_feature_R(df1):
    df = df1[pd.notna(df1['botscore'])]
    df = df[df['botscore'] >= 0]
    df['psi'] = df['followersCount']/(df['friendsCount']+1)
    df['tr'] = df['tweetsCount']/(df['retweetsCount']+1)
    feature = df[['psi','statusesCount','tr','retweetsCount','tweetsCount','listedCount','favoritesCount','influence_percentile','botscore']]
    return df, feature

In [41]:
def group_data(df):
    isbot = df[df['isbot'] == 1]
    nonbot = df[df['isbot'] == 0]
    return isbot, nonbot

In [42]:
def oversampling(df, copy_num):
    isbot, nonbot = group_data(df)
    isbot_copy = isbot
    isbot.index = range(len(isbot))
    l = len(isbot)
    print("before oversampling:" + str(np.shape(df)))
    for i in range(copy_num):
        isbot = pd.concat([isbot, isbot_copy])
    
    feature = pd.concat([isbot, nonbot])
    feature = utils.shuffle(feature)
    
    print("after oversampling:" + str(np.shape(feature)))
    return feature

In [43]:
def undersampling(df, drop_prop):
    print("before undersampling:" + str(np.shape(df)))
    isbot, notbot = group_data(df)
    utils.shuffle(notbot)
    notbot.index = range(len(notbot))
    notbot = notbot.drop(range(int(len(notbot) * drop_prop)))
    
    feature = pd.concat([isbot, notbot])
    feature = utils.shuffle(feature)
    
    print("after undersampling:" + str(np.shape(feature)))
    return feature

In [44]:
def preprocess(feature, is_oversampling = False, is_undersampling = False, is_smote = True):
    
    feature['isbot'] = pd.to_numeric(feature['isbot'], downcast='integer')
    feature[feature.columns[:-1]] = feature[feature.columns[:-1]].fillna(feature[feature.columns[:-1]].mean())
    
    if is_oversampling:
        feature = oversampling(feature, 3)
    if is_undersampling:
        feature = undersampling(feature, 0.5)
    
    y = np.asarray(feature.isbot) 
    x = np.asarray(feature[feature.columns[:-1]])
    
    if is_smote:
        print("data amount before smote:" + str(len(y)))
        smote = SMOTE()
        x, y = smote.fit_sample(x, y)
        print("data amount after smote:" + str(len(y)))
    
    return x, y

In [45]:
def preprocess_R(feature):
    feature[feature.columns[:-1]] = feature[feature.columns[:-1]].fillna(feature[feature.columns[:-1]].mean())
    y = np.asarray(feature.botscore)
    x = np.asarray(feature[feature.columns[:-1]])
    
    return x, y

In [46]:
def evaluate(y_test, y_predict, argmax=False):
    if argmax:
        y_test = to_categorical(y_test)
        matrix = metrics.confusion_matrix(y_test.argmax(axis=1), y_predict.argmax(axis=1))
    else:
        matrix = metrics.confusion_matrix(y_test, y_predict)
    
    a = matrix[0][0] / matrix[0].sum()
    b = matrix[1][1] / matrix[1].sum()
    balanced_acc = (a + b) / 2
    
    print("balanced_acc:\t %.4f\t" %(balanced_acc))
    print("confusion matrix:")
    print(str(matrix))
    
    return matrix, balanced_acc

In [47]:
def build_tree(x, y, is_bagging=False, is_random_froest=False, is_svm=False, is_Ada=False):
#     x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.4, random_state=1)
    x_train = x
    y_train = y
    model = None
    tree = DecisionTreeClassifier(criterion='entropy')
    
    if is_bagging:
        bag = BaggingClassifier(base_estimator=tree,n_estimators=500,max_samples=1.0,max_features=1.0,bootstrap=True,bootstrap_features=False,n_jobs=1,random_state=1)
        bag.fit(x_train, y_train)
#         y_trained = bag.predict(x_train)
#         y_tested = bag.predict(x_test)
        model = bag
    
    elif is_random_froest:
        random_forest = RandomForestClassifier(n_estimators=500)
        random_forest.fit(x_train, y_train)
#         y_trained = random_forest.predict(x_train)
#         y_tested = random_forest.predict(x_test)
        model = random_forest
        
    elif is_svm:
        svm_model = svm.SVC()
        svm_model.fit(x_train, y_train)
#         y_trained = svm_model.predict(x_train)
#         y_tested = svm_model.predict(x_test)
        model = svm_model
    
    elif is_Ada:
        ada = AdaBoostClassifier(base_estimator=tree, learning_rate=0.1, n_estimators=500)
        ada.fit(x_train, y_train)
#         y_trained = ada.predict(x_train)
#         y_tested = ada.predict(x_test)
        model = ada
        
    else:
        tree.fit(x_train, y_train)
#         y_trained = tree.predict(x_train)
#         y_tested = tree.predict(x_test)
        model = tree
    
#     tree_train=accuracy_score(y_train, y_trained)
#     tree_test=accuracy_score(y_test, y_tested)
        
#     print("\n******** fit completed ********\n")
#     print("training acc:\t %.4f\t" %(tree_train))
#     print("test acc:\t %.4f\t" %(tree_test))
    
    
#     matrix, balanced_acc = evaluate(y_test, y_tested)
    return model    

In [48]:
def nn(x, y):
    y = to_categorical(y)
    
#     x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.4, random_state=1)
    x_train = x
    y_train = y
#     print("training data: %d" %(len(y_train)))
#     print("test data: %d" %(len(y_test)))
    
    model = Sequential()
    model.add(Dense(64, input_dim=10, init='uniform', activation='relu'))
    model.add(Dropout(0.4))
    model.add(Dense(64, init='uniform', activation='relu'))
    model.add(Dropout(0.4))
    model.add(Dense(128, init='uniform', activation='relu'))
    model.add(Dropout(0.4))
    model.add(Dense(128, init='uniform', activation='relu'))
    model.add(Dropout(0.4))
    model.add(Dense(256, init='uniform', activation='relu'))
    model.add(Dropout(0.4))
    model.add(Dense(256, init='uniform', activation='relu'))
    model.add(Dropout(0.4))
    model.add(Dense(64, init='uniform', activation='relu'))
    model.add(Dropout(0.4))
    model.add(Dense(64, init='uniform', activation='relu'))
    model.add(Dropout(0.4))
    model.add(Dense(2, init='uniform', activation='softmax'))
    
    model.compile(loss='mean_squared_error', optimizer='adam', metrics=['accuracy'])
    
    history = model.fit(x_train, y_train, batch_size=1024, epochs=200)
    return model, history

In [49]:
def regression(x,y,model):
#     x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.4, random_state=1)
    x_train = x
    y_train = y
#     print("training data: %d" %(len(y_train)))
#     print("test data: %d" %(len(y_test)))
    if model == 'Bayes':
        clf = BayesianRidge(n_iter=1000,normalize=True)
    if model == 'RF':
        clf = RandomForestRegressor(max_depth=8,max_features=4)
    if model == 'SVR':
        clf = SVR(C=0.01,degree=4)
    if model == 'gdbt':
        clf = GradientBoostingRegressor(loss='ls',n_estimators=600,learning_rate=0.05,max_depth=8)
    if model == 'tree':
        clf = DecisionTreeRegressor(max_depth=8,max_features=6)
    if model == 'DNN':
        clf = Sequential()
        
        clf.add(Dense(128, kernel_initializer='normal',input_dim = x_train.shape[1], activation='relu'))

        clf.add(Dense(256,activation='relu'))
        clf.add(Dense(256,activation='relu'))
        clf.add(Dense(256,activation='relu'))
        clf.add(Dense(256,activation='relu'))
        clf.add(Dense(256,activation='relu'))

        clf.add(Dense(1, kernel_initializer='normal',activation='linear'))

        clf.compile(loss='mae', optimizer='adam', metrics=['mse'])
        clf.summary()
        clf.fit(x_train, y_train, epochs=50, batch_size=32, validation_split = 0.2)
    if model != 'DNN':
        clf.fit(x_train, y_train)
        
#     y_trained = clf.predict(x_train)
#     y_tested = clf.predict(x_test)
    
#     svc_train= sqrt(mean_squared_error(y_train, y_trained))
#     svc_test = sqrt(mean_squared_error(y_test, y_tested))
    
#     print("\n******** fit completed ********\n")
#     print("training rmse:\t %.4f\t" %(svc_train))
#     print("test rmse:\t %.4f\t" %(svc_test))

    return clf    

In [50]:
start = time.time()

df = read_data()
df_c, feature = select_fearture(df)
df_r,feature_r = select_feature_R(df) 

xc, yc = preprocess(feature, is_smote=True)
xr,yr = preprocess_R(feature_r)

model, history = nn(xc, yc)
# tree = build_tree(xc, yc, is_bagging=True)
regression = regression(xr,yr,'gdbt')

end = time.time()
print("\n******* running time: %.3f s *******" %(end - start))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[k1] = value[k2]


data amount before smote:88143
data amount after smote:169474


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[k1] = value[k2]
  # This is added back by InteractiveShellApp.init_path()
  del sys.path[0]
  from ipykernel import kernelapp as app


Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

Epoch 165/200
Epoch 166/200
Epoch 167/200
Epoch 168/200
Epoch 169/200
Epoch 170/200
Epoch 171/200
Epoch 172/200
Epoch 173/200
Epoch 174/200
Epoch 175/200
Epoch 176/200
Epoch 177/200
Epoch 178/200
Epoch 179/200
Epoch 180/200
Epoch 181/200
Epoch 182/200
Epoch 183/200
Epoch 184/200
Epoch 185/200
Epoch 186/200
Epoch 187/200
Epoch 188/200
Epoch 189/200
Epoch 190/200
Epoch 191/200
Epoch 192/200
Epoch 193/200
Epoch 194/200
Epoch 195/200
Epoch 196/200
Epoch 197/200
Epoch 198/200
Epoch 199/200
Epoch 200/200

******* running time: 1629.507 s *******


In [51]:
y_test = model.predict(xc)

In [52]:
cr =[]
for i in c_result:
    cr.append(np.argmax(i))

In [53]:
def read_data_result(path='testing_set_features.csv'):
    df = pd.read_csv(path,sep='\t')
    return df

In [54]:
res = read_data_result()

In [55]:
res['psi'] = res['followersCount']/(res['friendsCount']+1)
res['tr'] = res['tweetsCount']/(res['retweetsCount']+1)
test_c = res[['user_id','friendsCount','followersCount','statusesCount','tweetsCount','retweetsCount','listedCount','favoritesCount','influence_percentile','psi','tr']]
test_c_f = test_c[test_c.columns[1:]]
userId = test_c[test_c.columns[:1]]
test_r = res[['psi','statusesCount','tr','retweetsCount','tweetsCount','listedCount','favoritesCount','influence_percentile']]

In [56]:
test_c_f = test_c_f.fillna(test_c_f.mean())
x_c = np.asarray(test_c_f)

In [57]:
test_r = test_r.fillna(test_r.mean())
x_r = np.asarray(test_r)

In [58]:
c_result = model.predict(x_c)
r_result = regression.predict(x_r)

In [59]:
cr =[]
for i in c_result:
    cr.append(np.argmax(i))

In [60]:
output = pd.DataFrame(data=userId)
output['botscore'] = r_result
output['is_bot'] = cr

In [61]:
output.loc[output.isnull().any(axis=1), :] = np.nan
output.to_csv('result.csv',sep='\t')