In [108]:
import numpy as np
import pandas as pd
import scripts as sc

# Dataset selector: names the sub-directory of data/ the CSV files are read from.
# Use the commented alternative to read from data/r instead.
prefix = 'python'
# prefix = 'r'

# Let pandas show up to 500 rows before truncating a displayed frame or series.
pd.options.display.max_rows = 500

## Abertura dos arquivos

In [45]:
# Read the three CSV files stored under data/<prefix>/.
# Answers and Questions are decoded as latin-1; Tags uses the pandas default encoding.
dataAnswers = pd.read_csv(
    'data/{}/Answers.csv'.format(prefix),
    encoding='latin-1',
)
dataQuestions = pd.read_csv(
    'data/{}/Questions.csv'.format(prefix),
    encoding='latin-1',
)
dataTags = pd.read_csv('data/{}/Tags.csv'.format(prefix))

# Row count of each frame, shown as the cell output.
tuple(map(len, (dataAnswers, dataQuestions, dataTags)))

(987122, 607282, 1885078)

## Remover todos os valores nulos do conjunto de dados

In [46]:
# Discard every row that contains at least one missing value, in all three frames.
dataAnswers, dataQuestions, dataTags = (
    frame.dropna() for frame in (dataAnswers, dataQuestions, dataTags)
)

# Row counts after the clean-up, shown as the cell output.
tuple(map(len, (dataAnswers, dataQuestions, dataTags)))

(981755, 601070, 1884635)

## Informações sobre os conjuntos de dados

In [47]:
# Print the column/dtype summary of each frame in turn. info() returns None,
# so the value of the cell is a tuple of three None.
tuple(frame.info() for frame in (dataAnswers, dataQuestions, dataTags))

<class 'pandas.core.frame.DataFrame'>
Int64Index: 981755 entries, 0 to 987121
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   Id            981755 non-null  int64  
 1   OwnerUserId   981755 non-null  float64
 2   CreationDate  981755 non-null  object 
 3   ParentId      981755 non-null  int64  
 4   Score         981755 non-null  int64  
 5   Body          981755 non-null  object 
dtypes: float64(1), int64(3), object(2)
memory usage: 52.4+ MB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 601070 entries, 0 to 607281
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   Id            601070 non-null  int64  
 1   OwnerUserId   601070 non-null  float64
 2   CreationDate  601070 non-null  object 
 3   Score         601070 non-null  int64  
 4   Title         601070 non-null  object 
 5   Body          601070 non-null  object 
dtypes: float

(None, None, None)

## Adição das colunas: ano, mes e dia

In [48]:
# Run the project helper over the raw CreationDate values of each frame.
# NOTE(review): the indexing below assumes the helper's result holds the year,
# month and day values at [0][0][0], [0][0][1] and [0][0][2] — confirm in scripts.
dateQuestion = sc.getDateFormat(dataQuestions['CreationDate'].values)
dateAnswer = sc.getDateFormat(dataAnswers['CreationDate'].values)

# Attach the three date parts as new columns on both frames.
for partIdx, colName in enumerate(('ano', 'mes', 'dia')):
    dataQuestions[colName] = dateQuestion[0][0][partIdx]
    dataAnswers[colName] = dateAnswer[0][0][partIdx]

## Tamanho dos conjuntos por ano

In [49]:
# Yearly row counts: questions in the first column, answers in the second.
# The second column is computed from dataAnswers, hence the 'Respostas' header.
print('\t\tQuestões:\t\tRespostas:')
for ano in range(2008, 2016+1):
    # Summing the boolean comparison counts the matching rows without
    # materialising a filtered copy of each frame.
    qtdQuestions = int((dataQuestions['ano'] == ano).sum())
    qtdAnswers = int((dataAnswers['ano'] == ano).sum())
    print('ano: {}\ttamanho: {}\t\ttamanho: {}'.format(ano, qtdQuestions, qtdAnswers))

		Questões:		Perguntas:
ano: 2008	tamanho: 1824		tamanho: 6847
ano: 2009	tamanho: 11461		tamanho: 35954
ano: 2010	tamanho: 26047		tamanho: 60370
ano: 2011	tamanho: 40391		tamanho: 83470
ano: 2012	tamanho: 60662		tamanho: 113695
ano: 2013	tamanho: 89375		tamanho: 149872
ano: 2014	tamanho: 109148		tamanho: 164860
ano: 2015	tamanho: 132161		tamanho: 193869
ano: 2016	tamanho: 130001		tamanho: 172818


## Total de questões para cada ano

In [50]:
# Number of questions per year, most frequent year first.
dataQuestions['ano'].value_counts(sort=True, ascending=False)

2015    132161
2016    130001
2014    109148
2013     89375
2012     60662
2011     40391
2010     26047
2009     11461
2008      1824
Name: ano, dtype: int64

## Consideramos como usuários não novatos (veteranos) aqueles que fizeram perguntas em 2008

In [51]:
# Owners of at least one question created in 2008, de-duplicated in order of
# first appearance and converted to a plain Python list.
veterans = (
    dataQuestions
    .loc[dataQuestions['ano'].eq(2008), 'OwnerUserId']
    .drop_duplicates()
    .tolist()
)

# How many distinct veteran users were found.
len(veterans)

853

In [53]:
# Switch: when True, every question whose owner asked something in 2008 is
# removed from dataQuestions; when False the frame is left untouched.
dropVeteran = False

if dropVeteran:
    # Owners of the 2008 questions, recomputed from the current frame.
    veteranIds = dataQuestions.loc[dataQuestions['ano'] == 2008, 'OwnerUserId'].unique()

    # One vectorised filter instead of dropping the rows one index at a time,
    # which copied the whole frame on every single drop. The copy keeps later
    # column assignments from targeting a slice of the old frame.
    dataQuestions = dataQuestions.loc[~dataQuestions['OwnerUserId'].isin(veteranIds)].copy()

# Number of questions remaining, shown as the cell output.
len(dataQuestions)

601070

## Total de novatos nos anos

In [54]:
# Lower-case the question bodies in place; the library terms searched for in
# 'Body' further down are all written in lower case.
dataQuestions['Body'] = dataQuestions['Body'].str.lower()

In [122]:
# Rows of the final table: [language, library, year, month, newcomers, total].
data = []

# Each inner list groups the spellings of one library. A question counts for
# the library when its body contains ANY of the spellings; the first spelling
# is the label written to the 'Lib' column.
libTerms = [
    ['sklearn', 'scikit-learn', 'scikits learn'],
    ['matplotlib'],
    ['pandas'],
    ['numpy'],
    ['flasky'],  # NOTE(review): possibly meant 'flask' — confirm before changing
]

# Loop-invariant ranges, computed once. The first year present in the data is
# skipped on purpose (range starts at min + 1).
anos = range(min(dataQuestions['ano'])+1, max(dataQuestions['ano'])+1)
meses = range(min(dataQuestions['mes']), max(dataQuestions['mes'])+1)

for libs in libTerms:
    # Counters are cumulative: they keep growing across the months and years
    # of one library and are only reset when the next library starts.
    qtdNewComers = 0
    qtdTotal = 0

    # OR together the plain-substring matches of every spelling so that the
    # alternative spellings actually contribute to the selection.
    mentionsLib = pd.Series(False, index=dataQuestions.index)
    for lib in libs:
        mentionsLib |= dataQuestions['Body'].str.contains(lib, regex=False, na=False)

    # The 'lib' flag column is kept on dataQuestions, as before.
    dataQuestions['lib'] = mentionsLib
    dataTopic = dataQuestions.loc[mentionsLib]

    for ano in anos:
        dataAno = dataTopic.loc[dataTopic['ano'] == ano]

        # A year without any matching question is reported for months 1..12;
        # otherwise the months observed in the whole question set are used.
        monthsToReport = range(1, 13) if len(dataAno) < 1 else meses

        for mes in monthsToReport:
            owners = dataAno.loc[dataAno['mes'] == mes, 'OwnerUserId']

            # Newcomers are the question owners that are not in `veterans`.
            # An empty month adds zero to both counters.
            qtdNewComers += int((~owners.isin(veterans)).sum())
            qtdTotal += len(owners)

            data.append([prefix, libs[0], ano, mes, qtdNewComers, qtdTotal])

dataLanguage = pd.DataFrame(data, columns=['Language', 'Lib', 'Year', 'Month', 'NewCommers', 'Total'])