In [1]:
import pandas as pd
import numpy as np
from glob import glob

In [2]:
def combine_pickles(path):
    files = glob(path)
    dfs = []
    for i, f in enumerate(files):
        dfs.append(pd.read_pickle(f))
    df = pd.concat(dfs, ignore_index=True)
    return df
    
df = combine_pickles('./panda_pickles/*.p')

In [3]:
print(df.shape)

(4450000, 9)


# Remove all cells with wrong language

In [4]:
bad_lan = ['ar-in',  'ar-us', 'de-de', 'es-e1', 'en-ph',\
 'es-es', 'es-mx', 'es-us', 'fa-in', 'fr-fr', 'in-us',\
 'it-in', 'it-it', 'no-es', 'pt-br', 'ro-es']

In [5]:
df['subject'] = df['subject'].str.lower()
df['content'] = df['content'].str.lower()
df['bestanswer'] = df['bestanswer'].str.lower()

In [6]:
df = df[~df['language'].isin(bad_lan)]

In [7]:
print(df.shape)

(3924824, 9)


# Remove cells without best answer

In [8]:
df = df[df.bestanswer != '']

In [9]:
print( df.shape)

(3850716, 9)


# Preprocess

In [15]:
import string
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from nltk import word_tokenize
import re

In [16]:
def sanitize_question(question):
    # remove special characters and stop words
    stop = stopwords.words('english') + ['$','%', '(', ')', '^', '*', '&', '±', '§', '<', '>', '#', '@', '\n', '\\']
    sanitized_question = " ".join([i for i in word_tokenize(question) if i not in stop])
    sanitized_question = SnowballStemmer("english").stem(sanitized_question)
    return sanitized_question

In [17]:
def preprocess(key = ''):
    values = df[key].values
    temp = ['ok']*len(df)
    for i, v in enumerate(values):
        temp[i] = sanitize_question(v)
        if i % 100000 == 0:
            print(i)
    return temp

# Preprocess

In [20]:
# temp_subjects = preprocess(key = 'subject')

In [19]:
df['subject'] = temp_subjects

In [38]:
reset_selective temp_subjects

Once deleted, variables cannot be recovered. Proceed (y/[n])?  y


In [41]:
# temp_content = preprocess(key = 'content')

In [40]:
df['content'] = temp_content

In [53]:
reset_selective temp_content

Once deleted, variables cannot be recovered. Proceed (y/[n])?  y


In [61]:
# temp_bestanswer = preprocess(key = 'bestanswer')

In [60]:
df['bestwanswer'] = temp_bestanswer

In [58]:
# df['bestwanswer']

In [57]:
reset_selective temp_bestanswer

Once deleted, variables cannot be recovered. Proceed (y/[n])?  y


# dataframe with only subject

In [70]:
df_subject = df[['subject', 'bestanswer']].copy()

In [75]:
df_subject.to_pickle('yahoo_df_subject.p')

# Dataframe with only subject with empty content

In [71]:
df_no_content = df[df.content == '']

In [72]:
df_no_content = df_no_content[['subject', 'bestanswer']].copy()

In [74]:
df_no_content.to_pickle('yahoo_df_subject_no-content.p')

(3850716, 10)

# Dataframe with subject+content

In [76]:
df['subject_content'] = df['subject'] + ' ' + df['content']

In [77]:
df_subject_content = df[['subject_content', 'bestanswer']].copy()

In [78]:
df_subject_content.to_pickle('yahoo_df_subject+content.p')

# sanity check

In [86]:
df_subject_content.shape

(3850716, 2)

In [85]:
df_subject.shape

(3850716, 2)

In [84]:
df_no_content.shape

(1681563, 2)