# Общая информация:
__Задачи, решаемые в ноутбуке:__

1) Отобрать вопросы, имеющие тег _Android_

2) Сбор ответов с флагом лучший ответ

3) Сбор вопросов с короткими ответами

4) Разбить на категории с помощью regex

# Импорт библиотек

In [77]:
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from bs4 import BeautifulSoup

pd.set_option("display.max_colwidth", None)

In [78]:
from postclassifier.postprocessor import *

# Фильтрация данных

In [115]:
def filter_column(df, column_name, text_to_filter, regex=False):
    """Return ``df`` without the rows whose ``column_name`` contains a pattern.

    Args:
        df: DataFrame to filter. It is not modified; a filtered frame is
            returned instead.
        column_name: Column whose string values are searched.
        text_to_filter: Substring to look for, or a regular expression when
            ``regex`` is true.
        regex: Forwarded to ``Series.str.contains``; ``False`` (the default)
            means a literal substring match.

    Returns:
        The rows of ``df`` in which the pattern was not found. Missing values
        count as "not found" (``na=False``), so those rows are kept.

    Side effects:
        Prints how many rows were removed.
    """
    has_match = df[f"{column_name}"].str.contains(
        f"{text_to_filter}", na=False, regex=regex
    )
    kept = df[~has_match]
    removed = len(df) - len(kept)
    print(f"Deleted {removed} rows")
    return kept

In [116]:
# Load the prepared question/answer DataFrame.
# NOTE(security): pickle.load can execute arbitrary code while unpickling, so
# only open pickle files produced by this project's own pipeline.
with open("../../data/prepared/q_a_df.p", "rb") as f:
    q_a_df = pickle.load(f)

In [117]:
len(q_a_df)

2176164

In [118]:
# Discard every row that has a missing value in at least one column.
q_a_df = q_a_df.dropna(axis=0, how="any")

In [119]:
len(q_a_df)

2014516

## Вопросы _"android"_

In [120]:
# Keep only the rows whose Tag value contains "android". The test runs on the
# Tag column directly: DataFrame.apply(axis=1) would build a Series for each
# of the ~2 million rows just to read one field.
# NOTE(review): `in` is an exact-element test if Tag holds lists and a
# substring test if it holds strings — confirm which one the pickle stores.
q_a_df = q_a_df.loc[q_a_df["Tag"].apply(lambda tags: "android" in tags)]

In [121]:
len(q_a_df)

128954

## Нет картинкам!

In [122]:
# Drop every question/answer pair that embeds an image in either body.
# The question body is filtered first, then the answer body; filter_column
# prints the number of rows removed by each pass.
q_a_df = q_a_df.pipe(filter_column, "Q_Body", "<img").pipe(
    filter_column, "A_Body", "<img"
)

Deleted 11046 rows
Deleted 1408 rows


In [123]:
len(q_a_df)

116500

## Ссылочку нельзя

In [124]:
# Drop every question/answer pair that contains a hyperlink in either body.
# The answer body is filtered first, then the question body; filter_column
# prints the number of rows removed by each pass.
q_a_df = q_a_df.pipe(filter_column, "A_Body", "<a href=").pipe(
    filter_column, "Q_Body", "<a href="
)

Deleted 35709 rows
Deleted 10823 rows


In [125]:
len(q_a_df)

69968

## No code вопросы

In [126]:
# Closing tags of an HTML <pre><code> block. Despite the name, the filtering
# cell that follows passes regex=False, so this is matched as a literal
# substring rather than as a regular expression.
pre_code_regex = "</code></pre>"

In [127]:
# Drop every question/answer pair that contains a code block in either body.
# The question body is filtered first, then the answer body; the pattern is
# matched literally (regex=False).
q_a_df = q_a_df.pipe(filter_column, "Q_Body", pre_code_regex, regex=False).pipe(
    filter_column, "A_Body", pre_code_regex, regex=False
)

Deleted 49056 rows
Deleted 8756 rows


In [128]:
len(q_a_df)

12156

## Только лучшие (ответы)

Выберем только ответы с наибольшим score, а вопросы — с неотрицательным (Q_Score >= 0)

In [129]:
# Minimum Q_Score a question must have to be kept; it is used as the lower
# bound (inclusive, via >=) by the question-score filter in the next cell.
questions_score = 0

In [130]:
# Keep the questions whose score is at least `questions_score`.
# A boolean mask with rebinding is used instead of query(..., inplace=True):
# q_a_df is already the result of earlier filtering, and mutating such a
# derived frame in place can raise pandas' SettingWithCopyWarning. The mask
# also avoids assembling the query expression from a formatted string.
q_a_df = q_a_df[q_a_df["Q_Score"] >= questions_score]

In [131]:
# For each question Id, idxmax yields the index label of the row with the
# highest A_Score (the first such row when several answers tie).
best_a_idx = q_a_df.groupby("Id")["A_Score"].idxmax()
# Keep exactly those rows, i.e. one top-scored answer per question.
# NOTE(review): selecting by label assumes the index labels are unique;
# duplicated labels would bring extra rows back — confirm.
q_a_df = q_a_df.loc[best_a_idx]

In [132]:
len(q_a_df)

8469

In [133]:
# Keep only the pairs whose selected answer has a score of at least 1.
# A boolean mask with rebinding replaces query(..., inplace=True), which
# mutates a frame derived from earlier slicing and can raise
# SettingWithCopyWarning; the original f-string also had no placeholders.
q_a_df = q_a_df[q_a_df["A_Score"] >= 1]
len(q_a_df)

4596

In [134]:
q_a_df.head(1)

Unnamed: 0,Id,Q_date_open,Q_Score,Q_Title,Q_Body,A_Score,A_Body,Tag
11466,146020,2008-09-28 14:53:56+00:00,51,Making Eclipse behave like Visual Studio,"<p>I'm doing some Android dev, and I much prefer Visual Studio, but I'll have to use <em>Eclipse</em> for this.</p>\n\n<p>Has anyone made a tool that switches <em>Eclipse</em> to look and behave more like visual studio? I mainly can't stand its <strong>clippyesqe</strong> suggestions on how I should program (Yes, I know I have not yet used that private field! Thanks Eclipse!), or its incredibly lousy <strong>intellisense</strong>.</p>\n\n<p>For example, in eclipse, if I don't type <code>this</code> first, its <strong>intellisense</strong> won't realize I want to look for locally scoped members. Also, the TAB to complete VS convention is drilled into my head, and <em>Eclipse</em> is ENTER to complete, I could switch everything by hand but that would take hours, and I was hoping someone had some sort of theme or something that has already done it :)</p>\n",13.0,"<p>Have you tried using the Visual Studio keybindings available in Eclipse Ganymede (3.4)?</p>\n\n<p>(You may want to know that ""IntelliSense"" is a Visual Studio-term, an probably unknown to anyone without Visual Studio-experience. ""Autocompletion"" is probably a more widely used term.)</p>\n","[java, android, eclipse, visual-studio, ide]"


## Убираем HTML

In [135]:
# Replace the HTML markup of each text column with its plain text.
# The parser is named explicitly: without it BeautifulSoup picks whichever
# parser happens to be installed (and warns about it), so the extracted text
# could differ between machines. "html.parser" ships with Python and needs no
# extra dependency.
q_a_df["Q_Body"] = q_a_df["Q_Body"].apply(
    lambda x: BeautifulSoup(x, "html.parser").get_text()
)
q_a_df["Q_Title"] = q_a_df["Q_Title"].apply(
    lambda x: BeautifulSoup(x, "html.parser").get_text()
)
q_a_df["A_Body"] = q_a_df["A_Body"].apply(
    lambda x: BeautifulSoup(x, "html.parser").get_text()
)



In [136]:
# Second clean-up pass: run removeHTML over the same three text columns.
# NOTE(review): removeHTML is presumably provided by the star import from
# postclassifier.postprocessor; its exact behaviour is not visible here.
q_a_df["Q_Body"] = q_a_df["Q_Body"].apply(removeHTML)
q_a_df["A_Body"] = q_a_df["A_Body"].apply(removeHTML)
q_a_df["Q_Title"] = q_a_df["Q_Title"].apply(removeHTML)

## Длина — не главное

In [137]:
# Length of each question and answer body, measured as the number of pieces
# obtained by splitting on a single space character. Newlines and tabs do not
# separate pieces, and consecutive spaces yield empty pieces that are counted.
q_a_df["Q_len"] = q_a_df["Q_Body"].map(lambda body: len(body.split(" ")))
q_a_df["A_len"] = q_a_df["A_Body"].map(lambda body: len(body.split(" ")))

In [138]:
q_a_df.describe()

Unnamed: 0,Id,Q_Score,A_Score,Q_len,A_len
count,4596.0,4596.0,4596.0,4596.0,4596.0
mean,15112230.0,5.389034,4.390992,84.270017,67.956701
std,10121790.0,27.388736,19.016472,84.720918,62.068147
min,146020.0,0.0,1.0,4.0,1.0
25%,6781105.0,0.0,1.0,44.0,29.0
50%,12303930.0,1.0,2.0,68.0,50.0
75%,21929290.0,3.0,3.0,105.0,85.0
max,40117580.0,1015.0,650.0,2282.0,690.0


In [139]:
# Keep only the questions whose Q_len is at most 150.
# A boolean mask with rebinding replaces query(..., inplace=True), which
# mutates a frame derived from earlier slicing and can raise
# SettingWithCopyWarning; the original f-string also had no placeholders.
q_a_df = q_a_df[q_a_df["Q_len"] <= 150]

In [140]:
len(q_a_df)

4128

# Regex для категоризации вопросов

In [141]:
from postclassifier.api_change import API_change
from postclassifier.api_usage import API_usage
from postclassifier.conceptual import Conceptual
from postclassifier.discrepancy import Discrepancy
from postclassifier.documentation import Documentation
from postclassifier.errors import Errors
from postclassifier.review import Review

In [142]:
# One instance of each question-category classifier, created in the listed
# order; the classification cell adds one column per entry.
classifiers = [
    classifier_cls()
    for classifier_cls in (
        API_change,
        API_usage,
        Conceptual,
        Discrepancy,
        Documentation,
        Errors,
        Review,
    )
]

In [143]:
# Add one column per classifier, named after the classifier, holding the
# result of classify(title, body) for every question in row order.
for cc in classifiers:
    q_a_df[f"{cc.name}"] = [
        cc.classify(title, body)
        for title, body in zip(q_a_df["Q_Title"], q_a_df["Q_Body"])
    ]

In [144]:
q_a_df.describe()

Unnamed: 0,Id,Q_Score,A_Score,Q_len,A_len,API_CHANGE,API_USAGE,CONCEPTUAL,DISCREPANCY,DOCUMENTATION,ERRORS,REVIEW
count,4128.0,4128.0,4128.0,4128.0,4128.0,4128.0,4128.0,4128.0,4128.0,4128.0,4128.0,4128.0
mean,15011470.0,5.563711,4.458576,68.03343,64.219719,0.015262,0.157946,0.220446,0.117248,0.018653,0.07655,0.043605
std,10103320.0,28.336631,18.886734,33.359034,58.567757,0.122606,0.364734,0.414597,0.321755,0.135313,0.265909,0.204239
min,146020.0,0.0,1.0,4.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,6716712.0,0.0,1.0,42.0,28.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,12168100.0,1.0,2.0,63.0,48.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,21785780.0,3.0,3.0,91.25,81.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,40073480.0,1015.0,650.0,150.0,690.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## Вопросы с API_USAGE

In [151]:
# Subset of the questions that were flagged as API_USAGE (column value 1).
df = q_a_df[q_a_df["API_USAGE"] == 1]

In [152]:
len(df)

652

In [154]:
# Persist the API_USAGE subset as CSV. No index argument is given, so the
# frame's index is written as well, as the first (unnamed) column.
df.to_csv("../../data/prepared/df_150.csv")