# Общая информация:
__Задачи, решаемые в ноутбуке:__

1) Отобрать вопросы, имеющие тег _Android_

2) Сбор ответов с флагом «лучший ответ»

3) Сбор вопросов с короткими ответами

4) Разбить на категории с помощью regex

# Импорт библиотек

In [73]:
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from bs4 import BeautifulSoup

pd.set_option("display.max_colwidth", None)

In [74]:
from postclassifier.postprocessor import *

# Фильтрация данных

In [75]:
def filter_column(df, column_name, text_to_filter, regex=False):
    """Drop the rows of ``df`` whose ``column_name`` text contains a pattern.

    Prints ``Deleted N rows`` (N = number of removed rows) as a side effect.

    Args:
        df: pandas DataFrame to filter; it is not modified.
        column_name: label of the text column to search. It is used as given,
            so non-string labels (e.g. integer column names) work too.
        text_to_filter: substring to look for, or a regular expression when
            ``regex`` is true. It is converted to ``str`` before matching.
        regex: when false (default) ``text_to_filter`` is matched literally.

    Returns:
        A DataFrame holding only the rows that do not contain the pattern.
        Rows whose value is missing never match (``na=False``), so they are
        kept.
    """
    contains_text = df[column_name].str.contains(
        str(text_to_filter), na=False, regex=regex
    )
    filtered = df[~contains_text]
    print(f"Deleted {len(df) - len(filtered)} rows")
    return filtered

In [76]:
# Load the prepared question/answer table from its pickle file.
# NOTE(review): pickle.load can execute arbitrary code stored in the file —
# only open pickles produced by this project's own pipeline.
with open("../../data/prepared/q_a_df.p", "rb") as f:
    q_a_df = pickle.load(f)

In [None]:
len(q_a_df)

2176164

In [None]:
# Drop every row that has a missing value in any column.
q_a_df = q_a_df.dropna()

In [None]:
len(q_a_df)

2014516

## Вопросы _"android"_

In [None]:
# Keep only the questions tagged "android".  Testing the Tag column directly
# avoids building a Series for every row, which is what a row-wise
# DataFrame.apply(axis=1) does.
# NOTE(review): if Tag holds a list of tags this is an exact tag match; if it
# holds a plain string it is a substring match — confirm against the data.
q_a_df = q_a_df.loc[q_a_df["Tag"].map(lambda tags: "android" in tags)]

In [None]:
len(q_a_df)

128954

## Нет картинкам!

In [None]:
# Discard question/answer pairs that embed an image, checking the question
# body first and the answer body second.
for body_column in ("Q_Body", "A_Body"):
    q_a_df = filter_column(q_a_df, body_column, "<img")

Deleted 11046 rows
Deleted 1408 rows


In [None]:
len(q_a_df)

116500

## Ссылочку нельзя

In [None]:
# Discard question/answer pairs that contain a hyperlink, checking the answer
# body first and the question body second.
for body_column in ("A_Body", "Q_Body"):
    q_a_df = filter_column(q_a_df, body_column, "<a href=")

Deleted 35709 rows
Deleted 10823 rows


In [None]:
len(q_a_df)

69968

## No code вопросы

In [None]:
# Closing tags of an HTML code block (<pre><code>...</code></pre>).
# NOTE: despite the name this is not used as a regular expression — the
# filtering that consumes it passes regex=False, so it is matched literally.
pre_code_regex = "</code></pre>"

In [None]:
# Discard question/answer pairs that contain a code block, checking the
# question body first and the answer body second (literal match).
for body_column in ("Q_Body", "A_Body"):
    q_a_df = filter_column(q_a_df, body_column, pre_code_regex, regex=False)

Deleted 49056 rows
Deleted 8756 rows


In [None]:
len(q_a_df)

12156

## Только лучшие (ответы)

Выберем только ответы с наибольшим score, а вопросы — с неотрицательным score (Q_Score >= 0)

In [None]:
# Minimum Q_Score a question must have to be kept; 0 keeps zero-score
# questions and drops only the negatively scored ones.
questions_score = 0

In [None]:
# Keep the questions whose score reaches the configured threshold.
q_a_df = q_a_df.query(f"Q_Score >= {questions_score}")

In [None]:
# For every question Id, find the index label of its highest-scoring answer,
# then keep only those rows (one answer per question).
best_a_idx = q_a_df["A_Score"].groupby(q_a_df["Id"]).idxmax()
q_a_df = q_a_df.loc[best_a_idx]

In [None]:
len(q_a_df)

8469

In [None]:
# Keep only the pairs whose answer has a score of at least 1.  The result is
# re-bound instead of using inplace=True, which mutates a frame that was
# itself derived from earlier selections.
q_a_df = q_a_df.query("A_Score >= 1")
len(q_a_df)

4596

In [None]:
q_a_df.head(1)

Unnamed: 0,Id,Q_date_open,Q_Score,Q_Title,Q_Body,A_Score,A_Body,Tag
11466,146020,2008-09-28 14:53:56+00:00,51,Making Eclipse behave like Visual Studio,"<p>I'm doing some Android dev, and I much prefer Visual Studio, but I'll have to use <em>Eclipse</em> for this.</p>\n\n<p>Has anyone made a tool that switches <em>Eclipse</em> to look and behave more like visual studio? I mainly can't stand its <strong>clippyesqe</strong> suggestions on how I should program (Yes, I know I have not yet used that private field! Thanks Eclipse!), or its incredibly lousy <strong>intellisense</strong>.</p>\n\n<p>For example, in eclipse, if I don't type <code>this</code> first, its <strong>intellisense</strong> won't realize I want to look for locally scoped members. Also, the TAB to complete VS convention is drilled into my head, and <em>Eclipse</em> is ENTER to complete, I could switch everything by hand but that would take hours, and I was hoping someone had some sort of theme or something that has already done it :)</p>\n",13.0,"<p>Have you tried using the Visual Studio keybindings available in Eclipse Ganymede (3.4)?</p>\n\n<p>(You may want to know that ""IntelliSense"" is a Visual Studio-term, an probably unknown to anyone without Visual Studio-experience. ""Autocompletion"" is probably a more widely used term.)</p>\n","[java, android, eclipse, visual-studio, ide]"


## Убираем HTML

In [None]:
# Strip the HTML markup from the text columns, keeping only the visible text.
# The parser is named explicitly: without it BeautifulSoup picks whichever
# parser happens to be installed (and emits GuessedAtParserWarning), so the
# extracted text could differ between environments.
for html_column in ("Q_Body", "Q_Title", "A_Body"):
    q_a_df[html_column] = q_a_df[html_column].apply(
        lambda markup: BeautifulSoup(markup, "html.parser").get_text()
    )

KeyboardInterrupt: 

In [8]:
# Run removeHTML over each text column: question body, answer body, then the
# question title.
for text_column in ("Q_Body", "A_Body", "Q_Title"):
    q_a_df[text_column] = q_a_df[text_column].apply(removeHTML)

## Длина — не главное

In [None]:
# Store the length of each question and answer as the number of pieces
# obtained by splitting the text on single spaces.
# NOTE(review): split(" ") yields empty pieces for consecutive spaces and does
# not split on tabs/newlines, so this only approximates a word count.
for body_column, len_column in (("Q_Body", "Q_len"), ("A_Body", "A_len")):
    q_a_df[len_column] = q_a_df[body_column].map(
        lambda text: len(text.split(" "))
    )

In [None]:
q_a_df.describe()

Unnamed: 0,Id,Q_Score,A_Score,Q_len,A_len
count,4596.0,4596.0,4596.0,4596.0,4596.0
mean,15112230.0,5.389034,4.390992,84.603133,68.28503
std,10121790.0,27.388736,19.016472,88.908595,62.4326
min,146020.0,0.0,1.0,4.0,1.0
25%,6781105.0,0.0,1.0,44.0,29.0
50%,12303930.0,1.0,2.0,68.0,51.0
75%,21929290.0,3.0,3.0,105.0,86.0
max,40117580.0,1015.0,650.0,2706.0,690.0


In [60]:
# Keep only the questions whose Q_len is at most 200.  The result is re-bound
# instead of using inplace=True, which mutates a frame that was itself
# derived from earlier selections.
q_a_df = q_a_df.query("Q_len <= 200")

In [81]:
len(q_a_df)

2176164

# Regex для категоризации вопросов

In [63]:
from postclassifier.api_change import API_change
from postclassifier.api_usage import API_usage
from postclassifier.conceptual import Conceptual
from postclassifier.discrepancy import Discrepancy
from postclassifier.documentation import Documentation
from postclassifier.errors import Errors
from postclassifier.review import Review

In [64]:
# One instance of every question-category classifier, in the order in which
# they are applied to the questions.
classifiers = [
    make_classifier()
    for make_classifier in (
        API_change,
        API_usage,
        Conceptual,
        Discrepancy,
        Documentation,
        Errors,
        Review,
    )
]

In [65]:
# Add one column per classifier, named after the classifier and holding its
# verdict for every question (computed from the title and the body).
for classifier in classifiers:
    q_a_df[f"{classifier.name}"] = [
        classifier.classify(title, body)
        for title, body in zip(q_a_df["Q_Title"], q_a_df["Q_Body"])
    ]

In [66]:
q_a_df.describe()

Unnamed: 0,Id,Q_Score,A_Score,Q_len,A_len,API_CHANGE,API_USAGE,CONCEPTUAL,DISCREPANCY,DOCUMENTATION,ERRORS,REVIEW
count,4403.0,4403.0,4403.0,4403.0,4403.0,4403.0,4403.0,4403.0,4403.0,4403.0,4403.0,4403.0
mean,15093130.0,5.453554,4.44288,74.638655,66.43107,0.015898,0.160572,0.229162,0.123552,0.019305,0.076993,0.048149
std,10110600.0,27.944757,19.378733,41.157083,60.728984,0.125096,0.367178,0.420341,0.329107,0.13761,0.266611,0.214105
min,146020.0,0.0,1.0,4.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,6773580.0,0.0,1.0,43.0,29.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,12282290.0,1.0,2.0,66.0,49.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,21818840.0,3.0,3.0,99.0,84.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,40117580.0,1015.0,650.0,200.0,690.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## Вопросы с API_USAGE

In [69]:
# Select the questions that the API_USAGE classifier flagged with 1.
df = q_a_df[q_a_df["API_USAGE"] == 1]

In [75]:
len(df)

707

In [76]:
df.describe()

Unnamed: 0,Id,Q_Score,A_Score,Q_len,A_len,API_CHANGE,API_USAGE,CONCEPTUAL,DISCREPANCY,DOCUMENTATION,ERRORS,REVIEW
count,707.0,707.0,707.0,707.0,707.0,707.0,707.0,707.0,707.0,707.0,707.0,707.0
mean,14142170.0,4.763791,4.475248,80.202263,67.577086,0.019802,1.0,0.175389,0.104668,0.028289,0.056577,0.043847
std,9584925.0,18.597022,26.425434,41.465984,66.250133,0.139418,0.0,0.380569,0.306341,0.165913,0.231196,0.2049
min,921130.0,0.0,1.0,5.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
25%,6289765.0,0.0,1.0,49.0,28.5,0.0,1.0,0.0,0.0,0.0,0.0,0.0
50%,11470300.0,1.0,1.0,72.0,50.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
75%,19922310.0,2.5,2.0,103.0,82.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
max,39905040.0,276.0,650.0,200.0,690.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [74]:
df.iloc[0]['Q_Body']

"I don't have any of the devices to test at the moment. I guess I'll start using the emulators later on.We're looking to offer mobile support. I was wondering how jQuery or even javascript renders in their respective browsers. What works? What doesn't? Any tips? Advice?"

In [154]:
# Save the API_USAGE subset as CSV; with pandas' default index=True the
# frame's index is written as the first column.
df.to_csv("../../data/prepared/df_200.csv")