In [1]:
%run /Users/jiamingqu/Desktop/proj/scripts/classifier/read.dataframe.ipynb

In [2]:
import re
import numpy as np
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
import warnings
warnings.filterwarnings("ignore")
import joblib
from imblearn.over_sampling import SMOTE, ADASYN

In [3]:
def recognize_age(text):
    
    '''
    Use regular expressions to recognize age in the text
    Input: text
    Output: recognized numeric age, or 0 indicating missing
    '''
    
    match = re.search("\d+[\s\-]?year[s]?[\s\-]?[old]?",text)
    if match:
        age = re.search("\d+",match[0])
        age_numeric = age[0]
        return age_numeric
    else:
        return 0

In [4]:
def check_age_diff_numeric(age_in_query, text):
    
    '''
    check the numerical age difference
    '''
    
    age_in_text = recognize_age(text)
    
    if age_in_text != 0:
        return 1/(1 + np.exp( - abs(age_in_query-int(age_in_text))) )   
    else:
        # the age is not reflected in the text
        return "MissingNumericAge"

In [5]:
def count_age_group_keywords_text(age_in_query, text):
    
    '''
    count the keywords of different age groups in the text
    '''
    
    counter_array = np.array([])
    text = re.sub(r"[^\w\s]", " ", text)
    text = text.lower()
    
    # infant
    infant_match = len(re.findall(r"\binfant[s]?\b",text))
    counter_array = np.append(counter_array, infant_match)
    
    # child
    child_match1 = re.findall(r'\bchildren\b',text)
    child_match2 = re.findall(r'\bchild\b',text)
    child_match3 = re.findall(r'\btoddler[s]?\b',text)
    child_match4 = re.findall(r'\bkid[s]?\b',text)
    child_match5 = re.findall(r'\bpreschool\b',text)
    child_match = len(child_match1)+len(child_match2)+len(child_match3)+len(child_match4)+len(child_match5) 
    counter_array = np.append(counter_array, child_match)
    
    # teenager/adolescent
    teen_match1 = re.findall(r'\bteen[s]?\b',text)
    teen_match2 = re.findall(r'\bteenager[s]?\b',text)
    teen_match3 = re.findall(r'\badolescent[s]?\b',text)
    teen_match = len(teen_match1)+len(teen_match2)+len(teen_match3)
    counter_array = np.append(counter_array, teen_match)
    
    # young
    young_match1 = re.findall(r'\byoung?\b',text)
    young_match2 = re.findall(r'\byounger?\b',text)
    young_match3 = re.findall(r'\byoungest?\b',text)
    young_match = len(young_match1)+len(young_match2)+len(young_match3)
    counter_array = np.append(counter_array, young_match)
    
    # adult/middle aged
    adult_match1 = re.findall(r'\badult[s]?\b',text)
    adult_match2 = re.findall(r'\b[^\s]*middle[^s]*age[^\s]*\b', text)
    adult_match3 = re.findall(r'\bmiddleaged\b', text)
    adult_match = len(adult_match1)+len(adult_match2)+len(adult_match3)
    counter_array = np.append(counter_array, adult_match)
    
    # aged
    old_match1 = re.findall(r'\bold\b',text)
    old_match2 = re.findall(r'\bolder\b',text) 
    old_match3 = re.findall(r'\belderly\b',text)
    old_match3 = re.findall(r'\baged\b',text)
    old_match4 = re.findall(r'\bold[^s]*age[^\s]*\b', text)
    old_match = len(old_match1) + len(old_match2) + len(old_match3) + + len(old_match4)
    counter_array = np.append(counter_array, old_match)
    
    if (infant_match+child_match+teen_match+young_match+adult_match+old_match) == 0: # there is no age info
        return "MissingTextAge"
    else:
        if age_in_query <= 1:
            return 1 if max(counter_array) == infant_match else 0
        
        elif (age_in_query >= 2) and (age_in_query <= 12):
            return 1 if max(counter_array) == child_match else 0
            
        elif (age_in_query >= 13) and (age_in_query <= 18):
            return 1 if max(counter_array) == teen_match else 0
            
        elif (age_in_query >= 19) and (age_in_query <= 24):
            return 1 if max(counter_array) == young_match else 0
        
        elif (age_in_query >= 25) and (age_in_query <= 60):
            return 1 if max(counter_array) == adult_match else 0
        
        else:
            return 1 if max(counter_array) == old_match else 0

In [6]:
def check_gender_diff(gender_in_query, text):
    
    '''
    check gender
    '''
    
    female_match1 = re.findall(r'\bwoman\b',text)
    female_match2 = re.findall(r'\bwomen\b',text)
    female_match3 = re.findall(r'\bfemale[s]?\b',text)
    female_match4 = re.findall(r'\bgirl[s]?\b',text)

    male_match1 = re.findall(r'\bman\b',text)
    male_match2 = re.findall(r'\bmen\b',text)
    male_match3 = re.findall(r'\bmale[s]?\b',text)
    male_match4 = re.findall(r'\bboy[s]?\b',text)
    
    female_count = len(female_match1) + len(female_match2) + len(female_match3) + len(female_match4)
    male_count = len(male_match1) + len(male_match2) + len(male_match3) + len(male_match4)
    
    if male_count + female_count >= 1:
    # there is gender info
        if gender_in_query == "male":
            if male_count > female_count:
                return 1
            else:
                return 0
        if gender_in_query == "female":
            if female_count > male_count:
                return 1
            else:
                return 0
    else:
        return "MissingGender"

In [6]:
def generate_features(year):
    
    # read dataframe and parsing
    df = read_dataframe(year, 'demo')
    df = df.dropna()
    df = df.reset_index(drop=True)
    
    # read query topics
    query_topics=read_query_topics(year,"demo")
    
    # corpus folder path
    folder_path = "../../../data/corpus/"
    
    feature_table_columns = ["age_diff_numeric", "age_missing_numeric" ,
                             "age_missing_text", "age_match_text",
                             "gender_diff", "gender_missing",
                             "topicid","docid","label"]
    feature_table = pd.DataFrame(columns=feature_table_columns)
    
    print("Parsing year {}".format(year))
    
    # iterate over topics
    for topic in set(df.trec_topic_number):
        
        demo = query_topics[topic]
        age = int(demo.split("-")[0])
        gender = demo.split(" ")[1]
        df_topic = df.loc[df.trec_topic_number == topic]
        
        for index, rows in df_topic.iterrows():
            docid = str(rows["trec_doc_id"])
            content = []
            with open (folder_path+docid+".txt", 'r') as f:
                for line in f.readlines():
                    content.append(line.strip())
            f.close()
            text = " ".join(content)

            # check numeric age
            age_numeric = check_age_diff_numeric(age, text)
            if age_numeric == "MissingNumericAge":
                (age_numeric_missing, age_numeric_diff) = (1, 0)
            else:
                (age_numeric_missing, age_numeric_diff) = (0, age_numeric)

            # check text age
            age_text = count_age_group_keywords_text(age, text)
            if age_text == "MissingTextAge":
                (age_text_missing, age_text_match) = (1, 0)
            else:
                (age_text_missing, age_text_match) = (0, age_text)

            # check geneder
            gender_check = check_gender_diff(gender, text)
            if gender_check == "MissingGender":
                (gender_missing, gender_diff) = (1, 0)
            else:
                (gender_missing, gender_diff) = (0, gender_check)

            
            topicid = rows["trec_topic_number"]
            label = rows["demographics_desc"]
            new_record = {"age_missing_numeric" : age_numeric_missing, 
                          "age_diff_numeric": age_numeric_diff,
                          "age_missing_text": age_text_missing, 
                          "age_match_text": age_text_match,
                          "gender_missing": gender_missing, 
                          "gender_diff": gender_diff,
                          "topicid":topicid,
                          "docid":docid,
                          "label":label}

            feature_table = feature_table.append(new_record, ignore_index=True)
            
        print("Topic {} has been parsed".format(topic))
    
    assert df.shape[0] == feature_table.shape[0]
    
    feature_table.to_csv(str(year) + ".demo.features.csv", index=False, sep = ",")

In [1]:
# generate_features(2017)
# generate_features(2018)

In [3]:
def training_testing_classifier(training_years, testing_years):
    
    df_list=[]
    for year in training_years:
        df = pd.read_csv(str(year)+".demo.features.csv")
        df_list.append(df)
    df_training=pd.concat(df_list)
    df_testing=pd.read_csv(str(testing_years)+".demo.features.csv")
    
    features = ["age_diff_numeric", "age_missing_numeric" ,
                "age_missing_text", "age_match_text",
                "gender_diff", "gender_missing"]
    training_features=df_training[features]
    testing_features=df_testing[features]
    training_labels=df_training.label
    testing_labels=df_testing.label
    
    # over-sampling
    training_features, training_labels = SMOTE().fit_resample(training_features, training_labels)
    
    # training
    logistic_model = LogisticRegression(multi_class="ovr",C=0.5)
    logistic_model.fit(training_features, training_labels)
    
    predicted_labels = logistic_model.predict(testing_features)
    print(classification_report(testing_labels, predicted_labels))
    
    joblib.dump(logistic_model, str(testing_years)+".demo.classifier.pkl")

In [4]:
# training_testing_classifier([2017],2018)

               precision    recall  f1-score   support

     Excludes       0.22      0.28      0.25       601
      Matches       0.27      0.42      0.32       800
Not Discussed       0.90      0.81      0.85      6704

     accuracy                           0.74      8105
    macro avg       0.46      0.51      0.48      8105
 weighted avg       0.79      0.74      0.76      8105



In [2]:
def print_original_distribution(training_years):
    
    df_list=[]
    for year in training_years:
        df = pd.read_csv(str(year)+".demo.features.csv")
        df_list.append(df)
    df_training=pd.concat(df_list)
    
    print(df_training.label.value_counts())

In [3]:
# print_original_distribution([2017])

Not Discussed    7126
Excludes          815
Matches           607
Name: label, dtype: int64
