Paper: https://www.andrew.cmu.edu/user/lakoglu/pubs/StackOverflow-churn.pdf

Description of datasets: https://ia800107.us.archive.org/27/items/stackexchange/readme.txt

Site for download of datasets: https://archive.org/details/stackexchange

This code has 6 steps

    1. Load StackOverflow datasets as dataframe
    2. Extract and label the datasets for each task
    3. Extract features for each task
    4. Analyze features
    5. Train models for each task with the features
    6. Quantify the importance of each feature category

1. Load StackOverflow datasets as dataframe

In [None]:
import sys
!{sys.executable} -m pip install xmltodict

In [None]:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

import xmltodict
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Authenticate the Colab user first, then hand the resulting
# application-default credentials to PyDrive. `gdrive` is the PyDrive client
# that later cells use to query Google Drive.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
gdrive = GoogleDrive(gauth)

In [None]:
def load_from_google_drive(dir_id):
    """Download the known dataset files from a Google Drive folder.

    Lists the entries whose parent is the Drive folder ``dir_id`` and, for
    every entry whose title is one of the expected dataset files, fetches its
    content into a local file with the same name.

    Args:
        dir_id: Google Drive ID of the folder to scan.

    Returns:
        list: Titles of the files that were fetched, in listing order.
    """
    wanted_titles = {'Users.xml', 'Posts.xml',
                     'users_reduce.pkl', 'posts_reduce.pkl'}
    files = []
    file_list = gdrive.ListFile({'q': "'{}' in parents".format(dir_id)}).GetList()
    for f in file_list:
        if f['title'] not in wanted_titles:
            continue
        print('  Load file: {}'.format(f['title']))
        # Must go through the authenticated PyDrive client `gdrive`; the name
        # `drive` is not a PyDrive client anywhere in this notebook.
        f_ = gdrive.CreateFile({'id': f['id']})
        f_.GetContentFile(f['title'])
        files.append(f['title'])
    return files


# Exactly one dataset folder is downloaded; the other two folder IDs are kept
# as commented-out alternatives.
# load_from_google_drive('1Fp_7GDH_t7xfnU8aXeKrcBC54_nECOcu')  ### Full dataset
# load_from_google_drive('1haYAgCV-TqTMYIk8N4eGE9H4hY2np5xr')   ### Small dataset
load_from_google_drive('1CRE27AaxJuX-9Kxtgk2GnmxQt6ECHeJS')   ### Tiny dataset


In [None]:
# Read xml file and transform to pandas dataframe

def xml2df(xml_path):
    """Read an XML file made of ``<row .../>`` elements into a DataFrame.

    Every ``<row>`` element becomes one DataFrame row and every XML attribute
    becomes a string-valued column named after the attribute. Attributes that
    a given row does not carry are left as NaN.

    Args:
        xml_path: Path to the XML file (e.g. ``'Users.xml'``).

    Returns:
        pandas.DataFrame: One row per ``<row>`` element; empty if there are
        none.
    """
    # Function-scope import keeps this cell self-contained (stdlib only).
    import xml.etree.ElementTree as ET

    records = []
    # Binary mode lets the parser honour the encoding declared in the XML
    # prolog; `with` guarantees the handle is closed even if parsing fails.
    with open(xml_path, 'rb') as f:
        # Stream element by element instead of building the whole tree, so
        # large files do not have to fit in memory as a parsed document.
        for _, elem in ET.iterparse(f, events=('end',)):
            if elem.tag == 'row':
                records.append(dict(elem.attrib))
                elem.clear()  # drop the element's data once it is copied
    return pd.DataFrame(records)

In [None]:
# 1. Read Users.xml

# `users_df` holds whatever xml2df returns for the Users file.
xml_path = 'Users.xml'
users_df = xml2df(xml_path)

# 2. Change data type of columns


In [None]:
# 1. Read Posts.xml

# `posts_df` holds whatever xml2df returns for the Posts file.
xml_path = 'Posts.xml'
posts_df = xml2df(xml_path)

# 2. Change data type of columns


In [None]:
# Save and Load dataframe
# Mount Google Drive at /content/gdrive.
# NOTE(review): this binds the name `drive` to the google.colab module, which
# is a different object from the PyDrive client `gdrive` created earlier.
from google.colab import drive
drive.mount('/content/gdrive')

def save_df(df, filename):
    """Pickle ``df`` to ``<filename>.pkl``.

    Args:
        df: Object exposing ``to_pickle`` (a pandas DataFrame in this
            notebook).
        filename: Target path without the ``.pkl`` suffix.
    """
    target_path = "{}.pkl".format(filename)
    df.to_pickle(target_path)

    
def load_df(filename):
    """Load the object pickled at ``<filename>.pkl``.

    Args:
        filename: Source path without the ``.pkl`` suffix.

    Returns:
        Whatever ``pd.read_pickle`` yields for that file.
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    source_path = "{}.pkl".format(filename)
    return pd.read_pickle(source_path)

  


2. Extract and label the datasets for each task

Restrict the data to the period covered by the dataset: July 31, 2008 – July 31, 2012.

There are 2 tasks:

    A. After a user's K-th post, predict how likely it is that the user will churn
    B. After the T-th day from the account creation of a user, predict how likely it is that the user will churn

In [None]:
# You should extract the dataset for the period of the dataset: July 31, 2008 ~  July 31, 2012

start_time = pd.to_datetime('2008-07-31')
end_time = pd.to_datetime('2012-07-31')

# Keep only the rows created inside the study window (both bounds inclusive;
# note `end_time` is midnight at the start of 2012-07-31).
# NOTE(review): assumes both frames expose a `CreationDate` column, as the
# StackExchange dump readme describes — confirm against what xml2df produces.
# pd.to_datetime makes the comparison valid whether or not the column has
# already been converted from strings.
posts_df = posts_df[pd.to_datetime(posts_df['CreationDate']).between(start_time, end_time)]
users_df = users_df[pd.to_datetime(users_df['CreationDate']).between(start_time, end_time)]

In [None]:
# Dataset in Task 1
#   Posts: Extract K posts of each user
#   Users: Extract users who post at least K

def getTask1Posts(posts, K=20):
    """Placeholder for the Task 1 post selection (K posts of each user).

    Not implemented yet: the arguments are ignored and None is returned.
    """
    return None


def getTask1Users(users, posts, K=20):
    """Placeholder for the Task 1 user selection (users with at least K posts).

    Not implemented yet: the arguments are ignored and None is returned.
    """
    return None

In [None]:
# Dataset in Task 2
#   Users: Extract users who post at least 1
#   Posts: Extract posts created within T days of the owner's account creation

def getTask2Posts(users, posts, T=30):
    """Placeholder for the Task 2 post selection (posts from the first T days).

    Not implemented yet: the arguments are ignored and None is returned.
    """
    return None

In [None]:
# Churn in Task 1
#   Churners: Users who did not post for at least 6 months from their K-th post 
#   Stayers:  Users who created at least one post within the 6 months from their K-th post

def getTask1Labels(users, posts, K=20):
    """Build the Task 1 label frame with an ``is_churn`` column.

    Starts from a copy of ``users`` with every column dropped (index only),
    passes it through ``getTask1Users(..., K=K)`` and sets ``is_churn`` to
    0.0 for every remaining row.

    NOTE(review): nothing here marks churners yet, so every label is 0.0; the
    churn rule described in the cell comment still has to be applied.

    Args:
        users: Users DataFrame; only its index is carried over.
        posts: Posts data forwarded to ``getTask1Users``.
        K: Post count forwarded to ``getTask1Users``.

    Returns:
        The label frame with the ``is_churn`` column.
    """
    label_df = users.drop(users.columns, axis=1)
    label_df = getTask1Users(label_df, posts, K=K)

    label_df['is_churn'] = 0.0
    # Hand the frame back to the caller, as getTask2Labels does; a bare
    # `return` here would discard the labels that were just built.
    return label_df


In [None]:
# Churn in Task2
#   Churners: Users who did not post for at least 6 months from T days after account creation
#   Stayers:  Users who created at least one post within the 6 months from T days after account creation

def getTask2Labels(users, posts, T=30):
    """Build the Task 2 label frame with an ``is_churn`` column.

    Takes an index-only copy of ``users``, passes it through
    ``getTask1Users`` with ``K=1`` and sets ``is_churn`` to 0.0 on the result.
    ``T`` is accepted but not used by the current body.

    Args:
        users: Users DataFrame; only its index is carried over.
        posts: Posts data forwarded to ``getTask1Users``.
        T: Day threshold for Task 2 (currently unused).

    Returns:
        The label frame with the ``is_churn`` column.
    """
    index_only = users.drop(columns=users.columns)
    labels = getTask1Users(index_only, posts, K=1)
    labels['is_churn'] = 0.0
    return labels


3. Extract features for each task

3-1. Temporal features

In [None]:
# Temporal features 1: gap1
def getTimeGap1OfUser(users, posts):
    """Placeholder for the ``gap1`` temporal feature.

    Not implemented yet: the arguments are ignored and None is returned.
    """
    return None


In [None]:
# Temporal features 2: gapK
def getTimeGapsOfPosts(posts, K):
    """Placeholder for the ``gapK`` temporal feature.

    Not implemented yet: the arguments are ignored and None is returned.
    """
    return None


In [None]:
# Temporal features 3: last_gap
def getTimeLastGapOfPosts(posts):
    """Placeholder for the ``last_gap`` temporal feature.

    Not implemented yet: the argument is ignored and None is returned.
    """
    return None


In [None]:
# Temporal features 4: time_since_last_post
def getTimeSinceLastPost(users, posts):
    """Placeholder for the ``time_since_last_post`` temporal feature.

    Not implemented yet: the arguments are ignored and None is returned.
    """
    return None


In [None]:
# Temporal features 5: mean_gap
def getTimeMeanGap(posts):
    """Placeholder for the ``mean_gap`` temporal feature.

    Not implemented yet: the argument is ignored and None is returned.
    """
    return None


3-2. Frequency features

In [None]:
# Frequency features 1: num_answers
# Frequency features 2: num_questions
def getNumAnswers(posts):
    """Placeholder for the ``num_answers`` frequency feature.

    Not implemented yet: the argument is ignored and None is returned.
    """
    return None

def getNumQuestions(posts):
    """Placeholder for the ``num_questions`` frequency feature.

    Not implemented yet: the argument is ignored and None is returned.
    """
    return None


In [None]:
# Frequency features 3: ans_ques_ratio
def getAnsQuesRatio(num_answers, num_questions):
    """Placeholder for the ``ans_ques_ratio`` frequency feature.

    Not implemented yet: the arguments are ignored and None is returned.
    """
    return None


In [None]:
# Frequency features 4: num_posts
def getNumPosts(posts):
    """Placeholder for the ``num_posts`` frequency feature.

    Not implemented yet: the argument is ignored and None is returned.
    """
    return None


3-3. Knowledge features

In [None]:
# Knowledge features 1: accepted_answerer_rep
def getRepOfAcceptedAnswerer(users, posts):
    """Placeholder for the ``accepted_answerer_rep`` knowledge feature.

    Not implemented yet: the arguments are ignored and None is returned.
    """
    return None


In [None]:
# Knowledge features 2: max_rep_answerer 
def getMaxRepAmongAnswerer(users, posts):
    """Placeholder for the ``max_rep_answerer`` knowledge feature.

    Not implemented yet: the arguments are ignored and None is returned.
    """
    return None


In [None]:
# Knowledge features 3: num_que_answered
def getNumQueAnswered(posts):
    """Placeholder for the ``num_que_answered`` knowledge feature.

    Not implemented yet: the argument is ignored and None is returned.
    """
    return None


In [None]:
# Knowledge features 4: time_for_first_ans
def getTimeForFirstAns(posts):
    """Placeholder for the ``time_for_first_ans`` knowledge feature.

    Not implemented yet: the argument is ignored and None is returned.
    """
    return None


In [None]:
# Knowledge features 5: rep_questioner
def getAvgRepOfQuestioner(users, posts):
    """Placeholder for the ``rep_questioner`` knowledge feature.

    Not implemented yet: the arguments are ignored and None is returned.
    """
    return None


In [None]:
# Knowledge features 6: rep_answerers
def getAvgRepOfAnswerer(users, posts):
    """Placeholder for the ``rep_answerers`` knowledge feature.

    Not implemented yet: the arguments are ignored and None is returned.
    """
    return None


In [None]:
# Knowledge features 7: rep_co_answerers
def getAvgRepOfCoAnswerer(users, posts):
    """Placeholder for the ``rep_co_answerers`` knowledge feature.

    Not implemented yet: the arguments are ignored and None is returned.
    """
    return None


In [None]:
# Knowledge features 8: num_answers_recvd
def getAvgNumAnsReceived(posts):
    """Placeholder for the ``num_answers_recvd`` knowledge feature.

    Not implemented yet: the argument is ignored and None is returned.
    """
    return None


3-4. Speed features

In [None]:
# Speed features 1: answering_speed
def getAnsweringSpeed(posts):
    """Placeholder for the ``answering_speed`` speed feature.

    Not implemented yet: the argument is ignored and None is returned.
    """
    return None


3-5. Quality features

In [None]:
# Quality features 1: ans_score
# Quality features 2: que_score
def getScoreOfAnswers(posts):
    """Placeholder for the ``ans_score`` quality feature.

    Not implemented yet: the argument is ignored and None is returned.
    """
    return None

def getScoreOfQuestions(posts):
    """Placeholder for the ``que_score`` quality feature.

    Not implemented yet: the argument is ignored and None is returned.
    """
    return None


3-6. Consistency features

In [None]:
# Consistency features 1: ans_stddev
# Consistency features 2: que_stddev
def getStdevOfScoresOfAnswers(posts):
    """Placeholder for the ``ans_stddev`` consistency feature.

    Not implemented yet: the argument is ignored and None is returned.
    """
    return None

def getStdevOfScoresOfQuestions(posts):
    """Placeholder for the ``que_stddev`` consistency feature.

    Not implemented yet: the argument is ignored and None is returned.
    """
    return None


3-7. Gratitude features

In [None]:
# Gratitude features 1: ans_comments
# Gratitude features 2: que_comments
def getAvgNumOfAnswers(posts):
    """Placeholder for the ``ans_comments`` gratitude feature.

    Not implemented yet: the argument is ignored and None is returned.

    NOTE(review): the cell comment names this feature ``ans_comments`` while
    the function name does not mention comments — confirm which is intended.
    """
    return None

def getAvgNumOfQuestions(posts):
    """Placeholder for the ``que_comments`` gratitude feature.

    Not implemented yet: the argument is ignored and None is returned.

    NOTE(review): the cell comment names this feature ``que_comments`` while
    the function name does not mention comments — confirm which is intended.
    """
    return None


3-8. Competitiveness features

In [None]:
# Competitiveness features 1: relative_rank_pos
def getRelRankPos(posts):
    """Placeholder for the ``relative_rank_pos`` competitiveness feature.

    Not implemented yet: the argument is ignored and None is returned.
    """
    return None


3-9. Content features

In [None]:
# Content features 1: ans_length
# Content features 2: que_length
def getLengthOfAnswers(posts):
    """Placeholder for the ``ans_length`` content feature.

    Not implemented yet: the argument is ignored and None is returned.
    """
    return None

def getLengthOfQuestions(posts):
    """Placeholder for the ``que_length`` content feature.

    Not implemented yet: the argument is ignored and None is returned.
    """
    return None


3-10. Extract all features for each task

In [None]:
def getFeatures(features, users, posts, task, K=None, T=None):
    """Return ``features`` without the row labelled ``-1``, if present.

    The feature-extraction steps are not wired in yet, so ``users``,
    ``posts``, ``K`` and ``T`` are currently unused.

    Args:
        features: Object with an ``index`` and a ``drop`` method (a pandas
            DataFrame in this notebook).
        users: Unused by the current body.
        posts: Unused by the current body.
        task: Task number; must be 1 or 2.
        K: Unused by the current body.
        T: Unused by the current body.

    Returns:
        ``features`` itself when it has no ``-1`` label, otherwise a copy
        with that label dropped.

    Raises:
        AssertionError: If ``task`` is neither 1 nor 2.
    """
    assert task in (1, 2)

    has_sentinel_row = -1 in features.index
    return features.drop(index=[-1]) if has_sentinel_row else features

In [None]:
# Build one feature frame per K (Task 1) and per T (Task 2).
# `list.append()` needs exactly one argument, so each iteration appends the
# frame returned by getFeatures.
# NOTE(review): the wiring below is inferred from the helper signatures in
# this notebook — the label frame is used as the starting `features` so that
# `is_churn` travels with the features, which the analysis cells appear to
# rely on since they iterate over these lists only. Confirm before relying
# on it.
task1_features = []
for K in range(1, 20+1):
    users_k = getTask1Users(users_df, posts_df, K=K)
    posts_k = getTask1Posts(posts_df, K=K)
    labels_k = getTask1Labels(users_df, posts_df, K=K)
    task1_features.append(
        getFeatures(labels_k, users_k, posts_k, task=1, K=K))

task2_features = []
# Task 2 uses the users with at least one post (same K=1 call as
# getTask2Labels); this does not depend on T, so compute it once.
users_t = getTask1Users(users_df, posts_df, K=1)
for T in [7, 15, 30]:
    posts_t = getTask2Posts(users_t, posts_df, T=T)
    labels_t = getTask2Labels(users_df, posts_df, T=T)
    task2_features.append(
        getFeatures(labels_t, users_t, posts_t, task=2, T=T))

4. Analyze features


In [None]:
# Figure 2: Gap between posts
#    For a user who churns, gap between consecutive posts keeps increasing. 
#    Gaps for those who stay are much lower, and stabilize around 20,000 minutes,
#      indicating routine posting activity in every ≈2 weeks.

# TODO: Figure 2 is not implemented; the loop body is a placeholder.
# Iterates K = 2..20 and leaves K bound to 20 in the notebook namespace.
for K in range(2, 21):
    pass

In [None]:
# Figure 3: # Answers vs Churn probability
#    The probability of churning for a user decreases the more answers s/he provides.
#    It is even lower if s/he asks more questions alongside.

# TODO: Figure 3 is not implemented; the loop body is a placeholder.
for features in task2_features:
    pass

In [None]:
# Figure 4: K vs Time taken for the first answer to arrive
#    The longer a user waits to receive an answer,
#      the lower the satisfaction level and the higher the chance of churning.


5. Train models for each task with the features

    1. Decision Tree
    2. SVM (Linear)
    3. SVM (RBF)
    4. Logistic Regression
    

In [None]:
# Table 2: Performance on Task 1

from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold

# NOTE(review): `seed` is defined but not used in this cell yet; presumably
# meant for the models / K-fold split imported above — confirm.
seed = 1234

# TODO: Table 2 training is not implemented; the loop body is a placeholder.
for i, features in enumerate(task1_features):
    pass

In [None]:
# Table 3: Performance on Task 2

# TODO: Table 3 training is not implemented; the loop body is a placeholder.
for i, features in enumerate(task2_features):
    pass

6. Quantify the importance of each feature category


In [None]:
# Table 4: Temporal Features Analysis

# TODO: Table 4 analysis is not implemented; the loop body is a placeholder.
for i, features in enumerate(task1_features):
    pass

In [None]:
# Figure 5: Churn prediction accuracy when features from each category are used in isolation
