https://www.kaggle.com/datasets/pwang001/user-package-information-of-mobile-operators

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import ensemble
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings("ignore")

In [2]:
#!wget https://www.dropbox.com/s/5b6o99k5ko6cr1r/train_1.csv
#!wget https://www.dropbox.com/s/br8m56fomdxs7lw/train_2.csv
#!wget https://www.dropbox.com/s/xmw5shslewwfi0n/test_1.csv
#!wget https://www.dropbox.com/s/uxxc48au1zclpu2/test_2.csv
df_train_1 = pd.read_csv('train_1.csv', encoding='utf-8')
df_train_2 = pd.read_csv('train_2.csv', encoding='utf-8')
df_test_1 = pd.read_csv('test_1.csv', encoding='utf-8')
df_test_2 = pd.read_csv('test_2.csv', encoding='utf-8')
df = pd.concat([df_train_1, df_train_2, df_test_1, df_test_2])
df_old = df

In [3]:
df.isnull().any()

service_type              False
is_mix_service            False
online_time               False
1_total_fee               False
2_total_fee               False
3_total_fee               False
4_total_fee               False
month_traffic             False
many_over_bill            False
contract_type             False
contract_time             False
is_promise_low_consume    False
net_service               False
pay_times                 False
pay_num                   False
last_month_traffic        False
local_trafffic_month      False
local_caller_time         False
service1_caller_time      False
service2_caller_time      False
gender                    False
age                       False
complaint_level           False
former_complaint_num      False
former_complaint_fee      False
current_service            True
user_id                   False
dtype: bool

In [4]:
allCs = df['current_service'].size
nullCs = df['current_service'][df['current_service'].isnull()].size
print('current_service count', allCs, sep=': ') 
print('current_service nulls count', nullCs, sep=': ')
print('percent of nulls', nullCs/allCs*100, sep=': ')

current_service count: 1479211
current_service nulls count: 360566
percent of nulls: 24.37556237751071


In [5]:
df['current_service'].unique()

array([99999825., 90063345., 90109916., 89950166., 89950168., 89950167.,
       90155946., 99999828., 99999826., 99999827., 99999830.,   999999.,
             nan])

In [6]:
del df["current_service"]
df.isnull().any()

service_type              False
is_mix_service            False
online_time               False
1_total_fee               False
2_total_fee               False
3_total_fee               False
4_total_fee               False
month_traffic             False
many_over_bill            False
contract_type             False
contract_time             False
is_promise_low_consume    False
net_service               False
pay_times                 False
pay_num                   False
last_month_traffic        False
local_trafffic_month      False
local_caller_time         False
service1_caller_time      False
service2_caller_time      False
gender                    False
age                       False
complaint_level           False
former_complaint_num      False
former_complaint_fee      False
user_id                   False
dtype: bool

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1479211 entries, 0 to 160565
Data columns (total 26 columns):
 #   Column                  Non-Null Count    Dtype  
---  ------                  --------------    -----  
 0   service_type            1479211 non-null  int64  
 1   is_mix_service          1479211 non-null  int64  
 2   online_time             1479211 non-null  int64  
 3   1_total_fee             1479211 non-null  float64
 4   2_total_fee             1479211 non-null  object 
 5   3_total_fee             1479211 non-null  object 
 6   4_total_fee             1479211 non-null  float64
 7   month_traffic           1479211 non-null  float64
 8   many_over_bill          1479211 non-null  int64  
 9   contract_type           1479211 non-null  int64  
 10  contract_time           1479211 non-null  int64  
 11  is_promise_low_consume  1479211 non-null  int64  
 12  net_service             1479211 non-null  int64  
 13  pay_times               1479211 non-null  int64  
 14  pay

In [8]:
# user_id ничего не определяет при классификации
del df["user_id"]

In [9]:
tf2 = pd.to_numeric(df['2_total_fee'], errors='coerce')

allCs = tf2.size
nullCs = tf2[tf2.isnull()].size
print('2_total_fee count', allCs, sep=': ') 
print('2_total_fee nulls count', nullCs, sep=': ')
print('percent of nulls', nullCs/allCs*100, sep=': ')

2_total_fee count: 1479211
2_total_fee nulls count: 15
percent of nulls: 0.0010140541139837386


In [10]:
df['2_total_fee'] = tf2
df = df[pd.notnull(df['2_total_fee'])]

In [11]:
tf3 = pd.to_numeric(df['3_total_fee'], errors='coerce')

allCs = tf3.size
nullCs = tf3[tf3.isnull()].size
print('3_total_fee count', allCs, sep=': ') 
print('3_total_fee nulls count', nullCs, sep=': ')
print('percent of nulls', nullCs/allCs*100, sep=': ')

3_total_fee count: 1479196
3_total_fee nulls count: 5
percent of nulls: 0.0003380214657151588


In [12]:
df['3_total_fee'] = tf3
df = df[pd.notnull(df['3_total_fee'])]

In [13]:
age = pd.to_numeric(df['age'], errors='coerce')

allCs = age.size
nullCs = age[age.isnull()].size
print('age count', allCs, sep=': ') 
print('age nulls count', nullCs, sep=': ')
print('percent of nulls', nullCs/allCs*100, sep=': ')

age count: 1479191
age nulls count: 2
percent of nulls: 0.00013520904332165354


In [14]:
df['age'] = age
df = df[pd.notnull(df['age'])]

In [15]:
gender = pd.to_numeric(df['gender'], errors='coerce')

allCs = gender.size
nullCs = gender[gender.isnull()].size
zeroCs = gender[gender == 0].size
print('gender count', allCs, sep=': ') 
print('gender nulls count', nullCs, sep=': ')
print('percent of nulls', nullCs/allCs*100, sep=': ')
print('gender zeros count', zeroCs, sep=': ')
print('percent of nulls', zeroCs/allCs*100, sep=': ')

gender count: 1479189
gender nulls count: 0
percent of nulls: 0.0
gender zeros count: 28785
percent of nulls: 1.9459987871732416


In [16]:
df['gender'] = gender
df = df[pd.notnull(df['gender'])]
df = df[df['gender'] != 0]

In [17]:
df = df.reset_index(drop=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1450404 entries, 0 to 1450403
Data columns (total 25 columns):
 #   Column                  Non-Null Count    Dtype  
---  ------                  --------------    -----  
 0   service_type            1450404 non-null  int64  
 1   is_mix_service          1450404 non-null  int64  
 2   online_time             1450404 non-null  int64  
 3   1_total_fee             1450404 non-null  float64
 4   2_total_fee             1450404 non-null  float64
 5   3_total_fee             1450404 non-null  float64
 6   4_total_fee             1450404 non-null  float64
 7   month_traffic           1450404 non-null  float64
 8   many_over_bill          1450404 non-null  int64  
 9   contract_type           1450404 non-null  int64  
 10  contract_time           1450404 non-null  int64  
 11  is_promise_low_consume  1450404 non-null  int64  
 12  net_service             1450404 non-null  int64  
 13  pay_times               1450404 non-null  int64  
 14  pa

In [18]:
# размеры изначального и обработанного фреймов
print(df_old.shape[0])
print(df.shape[0])
print('rows removed', df_old.shape[0] - df.shape[0], sep=': ')
print('in percents (%)', (df_old.shape[0] - df.shape[0]) / df_old.shape[0] * 100, sep=': ')

1479211
1450404
rows removed: 28807
in percents (%): 1.9474571241019707


In [19]:
x = df.loc[:, df.columns != 'gender']
y = df['gender']
train_points, test_points, train_values, test_values = train_test_split(x, y, random_state=104, test_size=0.25, shuffle=False, stratify=None)

In [20]:
rf_model = ensemble.RandomForestClassifier(n_estimators=100)
rf_model.fit(train_points, train_values)
test_predict_rf = rf_model.predict(test_points)
print(accuracy_score(test_values, test_predict_rf) * 100)

88.34724669816134


In [21]:
filt = df
filt = filt[filt['online_time'] <= 153.2]
filt = filt[filt['1_total_fee'] <= 407.04]
filt = filt[filt['2_total_fee'] <= 466.53]
filt = filt[filt['3_total_fee'] <= 414.89]
filt = filt[filt['4_total_fee'] <= 413.28]
filt = filt[filt['month_traffic'] <= 14303.72]
filt = filt[filt['pay_times'] <= 9.72]
filt = filt[filt['pay_num'] <= 3071.01]
filt = filt[filt['last_month_traffic'] <= 4177.35]
filt = filt[filt['local_trafffic_month'] <= 61760.91]
filt = filt[filt['local_caller_time'] <= 662.78]
filt = filt[filt['service1_caller_time'] <= 407.45]
filt = filt[filt['service2_caller_time'] <= 995.47]
filt = filt[filt['age'] <= 70.04]
filt = filt[filt['age'] > 0]
filt = filt[filt['former_complaint_num'] <= 2.4]
filt = filt[filt['former_complaint_fee'] <= 8218.7]
filt = filt.reset_index(drop=True)

In [22]:
# размеры изначального и обработанного фреймов
print(df_old.shape[0])
print(filt.shape[0])
print('rows removed', df_old.shape[0] - filt.shape[0], sep=': ')
print('in percents (%)', (df_old.shape[0] - filt.shape[0]) / df_old.shape[0] * 100, sep=': ')

1479211
1324527
rows removed: 154684
in percents (%): 10.457196437830708


In [23]:
x = filt.loc[:, filt.columns != 'gender']
y = filt['gender']
train_points_f, test_points_f, train_values_f, test_values_f = train_test_split(x, y, random_state=104, test_size=0.25, shuffle=False, stratify=None)

In [24]:
rf_model = ensemble.RandomForestClassifier(n_estimators=100)
rf_model.fit(train_points_f, train_values_f)
test_predict_rf = rf_model.predict(test_points_f)
print(accuracy_score(test_values_f, test_predict_rf) * 100)

88.39073239674813
