In [238]:
import joblib
import numpy as np
import pandas as pd
from scipy.sparse import hstack
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction import FeatureHasher
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


In [239]:
# Read the labelled training file into a DataFrame (decoded as latin1)
data = pd.read_csv(
    filepath_or_buffer='Doceree-HCP_Train.csv',
    encoding='latin1',
)

In [254]:

# Target: the IS_HCP column.
# Features: every other column except TAXONOMY, which is dropped together with the target.
y = data['IS_HCP']
X = data.drop(columns=['IS_HCP', 'TAXONOMY'])

In [255]:
# Preview the first five rows of the raw training frame
data.head(n=5)

Unnamed: 0,ID,DEVICETYPE,PLATFORM_ID,BIDREQUESTIP,USERPLATFORMUID,USERCITY,USERZIPCODE,USERAGENT,PLATFORMTYPE,CHANNELTYPE,URL,KEYWORDS,TAXONOMY,IS_HCP
0,1001,Desktop,2,170.173.0.22,6974dcaa-f932-480e-9fb5-c52e20e1393a,Portland,97206.0,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,Online Medical Journal,Website,https://www.cancertherapyadvisor.com/home/canc...,General|Endocrine|False|Medicine|Surgery|Urolo...,,0.0
1,1002,Desktop,2,65.216.253.25,c12f3f8f-8fcf-484a-90e1-1ac04db8cdcf,Arlington,22202.0,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7...,Online Medical Journal,Website,https://www.cancertherapyadvisor.com/home/deci...,Bone Marrow|Radiography|Chronic|Oncology|Psych...,,0.0
2,1003,Desktop,2,66.232.79.22,a698de4b-e200-46dd-b5fb-40402175ae18,New Meadows,83654.0,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7...,Online Medical Journal,Website,https://www.cancertherapyadvisor.com/home/canc...,General|Endocrine|False|Medicine|Surgery|Urolo...,,0.0
3,1004,Desktop,3,137.54.125.246,45967533-75c8-4fbd-a00c-e6ff20447aaa,,229114624.0,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,Online Medical Journal,Website,https://globalrph.com/medcalcs/warfarin-mainte...,Dental|Total|Clinical|Pharmacology|Physicians|...,2084P0800X,1.0
4,1005,Mobile,7,174.202.231.99,a17e25be-532d-4cf5-b916-9308c8c3961f,Houston,77008.0,Mozilla/5.0 (iPhone; CPU iPhone OS 16_1_1 like...,Online Medical Journal,Website,https://www.cureus.com/articles/58184-a-review...,Critical Care|Emergency Medicine|General Pract...,,0.0


In [257]:
# Columns treated as categorical; each is cast to str so that missing values
# and numeric ids (e.g. PLATFORM_ID) become plain string tokens.
categorical_columns = [
    'DEVICETYPE',
    'PLATFORM_ID',
    'BIDREQUESTIP',
    'USERPLATFORMUID',
    'USERCITY',
    'USERAGENT',
    'PLATFORMTYPE',
    'CHANNELTYPE',
    'URL',
    'KEYWORDS',
]
X_categorical = X.loc[:, categorical_columns].astype(str)

In [258]:
# Hash each row's categorical strings into a fixed-width sparse vector.
# Every row is handed to the hasher as a sequence of raw string values, so the
# column a value came from is not part of the hashed token.
hasher = FeatureHasher(input_type='string', n_features=1000)
categorical_rows = X_categorical.values
hashed_features = hasher.transform(categorical_rows)

In [259]:
# Bag-of-words features from "<zip code> <user agent>" per row.
# A missing USERAGENT turns the whole concatenation into NaN, so such rows end
# up as the empty string (zip code included) after fillna.
zip_as_text = X['USERZIPCODE'].astype(str)
text_features = (zip_as_text + ' ' + X['USERAGENT']).fillna('')

# The fitted vectorizer is kept so the same vocabulary can be applied later
vectorizer = CountVectorizer()
X_text = vectorizer.fit_transform(text_features)

In [260]:
# Final design matrix: hashed categorical block followed by the bag-of-words block
feature_blocks = [hashed_features, X_text]
X_processed = hstack(feature_blocks)

In [261]:
# Replace the string columns of X with integer codes, one fresh encoder per column.
# These codes are stored on X only; X_processed has already been assembled from
# the hashed and vectorized features.
string_columns = [
    'DEVICETYPE',
    'BIDREQUESTIP',
    'USERPLATFORMUID',
    'USERCITY',
    'USERAGENT',
    'PLATFORMTYPE',
    'CHANNELTYPE',
    'URL',
    'KEYWORDS',
]
for col_name in string_columns:
    if col_name not in X.columns:
        continue  # tolerate columns that are absent from the frame
    column_as_text = X[col_name].astype(str)
    X[col_name] = LabelEncoder().fit_transform(column_as_text)

In [262]:

# Hold out 30% of the rows for evaluation; the seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X_processed,
    y,
    test_size=0.3,
    random_state=42,
)

In [263]:
# Train a linear classifier with logistic loss in mini-batches.
# Each partial_fit call performs a single pass over its slice, so the training
# set is seen exactly once, `batch_size` rows at a time.
# NOTE(review): recent scikit-learn releases spell this loss 'log_loss'; 'log' is
# kept because it is what this notebook was executed with — confirm against the
# installed version.
model = SGDClassifier(loss='log', max_iter=1000, random_state=42)
batch_size = 1000
n_train_rows = X_train.shape[0]
num_batches = int(np.ceil(n_train_rows / batch_size))

# Loop-invariant: the full label set must be supplied to partial_fit, compute it once
classes = np.unique(y_train)
# Plain array so that the slices below are unambiguously positional
y_train_values = np.asarray(y_train)

for batch_idx in range(num_batches):
    start_idx = batch_idx * batch_size
    end_idx = min(start_idx + batch_size, n_train_rows)
    model.partial_fit(
        X_train[start_idx:end_idx],
        y_train_values[start_idx:end_idx],
        classes=classes,
    )

# Drop evaluation rows that have no label. The mask is built ONCE from the
# unfiltered labels and applied to both the features and the labels; building it
# after y_test has been filtered would yield an all-True mask of the wrong length
# and leave X_test misaligned with y_test.
labelled_mask = ~np.isnan(np.asarray(y_test, dtype=float))
X_test = X_test[labelled_mask]
y_test = y_test[labelled_mask]

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy is hcp :", accuracy)



Accuracy is hcp : 0.9414587051285802


In [264]:
# Random Forest on the same split. `model` is rebound here, so from this cell on
# it refers to the forest rather than the SGD classifier.
# A fixed seed (the same 42 used for the split and the SGD model) makes the
# reported accuracy and every later prediction reproducible across runs.
model = RandomForestClassifier(random_state=42)

# Fit the model to the training data
model.fit(X_train, y_train)

# Evaluate the model on the held-out rows
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9649220327082297


In [265]:
# Read the file that needs predictions
test_data = pd.read_csv('Doceree-HCP-Test.csv', encoding='latin1')

# Categorical block: same columns, same str cast, same hasher object as in training
X_test_categorical = test_data.loc[:, categorical_columns].astype(str)
hashed_features_test = hasher.transform(X_test_categorical.values)

# Text block: "<zip code> <user agent>" per row, NaN results blanked, then mapped
# onto the vocabulary learned from the training text (transform only, no refit)
zip_as_text_test = test_data['USERZIPCODE'].astype(str)
text_features_test = (zip_as_text_test + ' ' + test_data['USERAGENT']).fillna('')
X_text_test = vectorizer.transform(text_features_test)

# Design matrix in the same block order as the training matrix
X_test_processed = hstack([hashed_features_test, X_text_test])

# Overwrite the string columns of test_data with integer codes from encoders
# fitted on this file. X_test_processed is already built, so this only affects
# the contents of test_data itself.
for col_name in string_columns:
    if col_name not in test_data.columns:
        continue
    test_data[col_name] = LabelEncoder().fit_transform(test_data[col_name].astype(str))

# Predict 'IS_HCP' with the estimator currently bound to `model`
y_pred_is_hcp = model.predict(X_test_processed)


In [270]:
# Persist the already-fitted estimator.
# It is deliberately NOT re-fitted here: `model` was trained in an earlier cell,
# and fitting again would train a second forest — a different one whenever the
# estimator has no fixed seed — so the saved file would no longer be the model
# that was evaluated and used for the predictions.
# joblib.dump writes a joblib (pickle-based) file; despite the '.h5' suffix it is
# not HDF5 and has to be read back with joblib.load. Only load such files from
# trusted sources, since unpickling can execute arbitrary code.
joblib.dump(model, 'random_forest_model.h5')


['random_forest_model.h5']

In [268]:
# Add the predictions as an 'IS_HCP' column and write the whole frame to CSV
# without the pandas row index
test_data['IS_HCP'] = y_pred_is_hcp
test_data.to_csv(path_or_buf='predicted_IS_HCP.csv', index=False)