In [1]:
from os import listdir
import pandas as pd 
import re
from collections import Counter
from nltk import tokenize
import matplotlib.pyplot as plt
import seaborn as sns
from stylometry.stylometry.extract import *
import spacy
import en_core_web_sm
from utilities import *

# ML
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.feature_selection import SelectFromModel

In [2]:
def load_data_to_pandas():
    data = {}
    for file_name in listdir('data/green roofs/'):
        text = open('data/green roofs/'+file_name, "r").read()
        data[file_name.replace('.txt', '')] = text
    data_frame = pd.DataFrame(data.items(), columns=['Source', 'Paper'])
    return data_frame
df = load_data_to_pandas()


In [3]:
df

Unnamed: 0,Source,Paper
0,"Razzaghmanesh, Beecham, Salemi (2016)","Urbanisation growth, climate change and water ..."
1,"Liu, Feng, Chen, Wei, Deo (2019)",Although rapid urbanization warrants a need to...
2,"Soulis, Ntoulas, Nektarios & Kargas (2017)",Green roofs are emerging as one of the most pr...
3,"Morakinyo, Dahanayake, Ng, Lun Chow (2017)","As a consequence of rapid urbanization, many g..."
4,"Herrera-Gomez, Quevedo-Nolasco, Perez-Urrestar...",Numerous studies on climate change predict a g...
5,"Tam, Wang, Le (2016 )","In Hong Kong, high pollution and lack of green..."
6,Rasul & Arutla (2020),"Industrialisation, increase in luxury of the p..."
7,"Chaudhary, Sandall, Lazarski (2019 )",Land-system conversion through urbanization re...
8,Vijayaraghavan (2016),"As a result of rapid economic growth, many cou..."
9,Lee & Jim (2018),Compact cities are susceptible to the double ...


In [4]:
df['alpha-chars-ratio'] = df['Paper'].apply(alpha_chars_ratio)
df['digit-chars-ratio'] = df['Paper'].apply(digit_chars_ratio)
df['upper-chars-ratio'] = df['Paper'].apply(upper_chars_ratio)
df['white-chars-ratio'] = df['Paper'].apply(white_chars_ratio)
df['number-of-words'] = df['Paper'].apply(total_number_of_words)
df['size-of-vocabulary'] = df['Paper'].apply(size_of_vocabulary)
df['type-token-ratio'] = df['Paper'].apply(lexical_diversity)
df['hapax-legomena'] = df['Paper'].apply(hapax_legomena)
df['hapax-dislegomena'] = df['Paper'].apply(hapax_dislegomena)
df['mean-word-length'] = df['Paper'].apply(mean_word_length)
df['mean-sentence-char-length'] = df['Paper'].apply(average_sentence_char_length)
df['mean-sentence-word-length'] = df['Paper'].apply(average_sentence_word_length)
df['mean-paragraph-length'] = df['Paper'].apply(mean_paragraph_len)
df['exclamation_mark_rate'] = df['Paper'].apply(exclamation_mark_rate)
df['question-mark-rate'] = df['Paper'].apply(question_mark_rate)
df['adverbs-rate'] = df['Paper'].apply(number_of_adverbs)
df['adjectives-rate'] = df['Paper'].apply(number_of_adjectives)
df['nouns-rate'] = df['Paper'].apply(number_of_nouns)
df['prepositions-rate'] = df['Paper'].apply(number_of_prepositions)
df['conjuctions-rate'] = df['Paper'].apply(number_of_conjuctions)
df['verb-rate'] = df['Paper'].apply(number_of_verbs)
df['adjectivs-adverbs-rate'] = df['Paper'].apply(adj_adv)
df['named-entites-count'] = df['Paper'].apply(number_of_ne)

In [5]:
df

Unnamed: 0,Source,Paper,alpha-chars-ratio,digit-chars-ratio,upper-chars-ratio,white-chars-ratio,number-of-words,size-of-vocabulary,type-token-ratio,hapax-legomena,...,exclamation_mark_rate,question-mark-rate,adverbs-rate,adjectives-rate,nouns-rate,prepositions-rate,conjuctions-rate,verb-rate,adjectivs-adverbs-rate,named-entites-count
0,"Razzaghmanesh, Beecham, Salemi (2016)","Urbanisation growth, climate change and water ...",0.774358,0.033204,0.021086,0.15681,646,295,41.21447,204,...,0.0,0.0,0.005574,0.01745,0.030053,0.017693,0.007271,0.01745,0.023025,0.020843
1,"Liu, Feng, Chen, Wei, Deo (2019)",Although rapid urbanization warrants a need to...,0.784761,0.026534,0.013045,0.153128,1030,431,36.696501,292,...,0.0,0.0,0.006819,0.020605,0.031574,0.015713,0.006671,0.017492,0.027424,0.015861
2,"Soulis, Ntoulas, Nektarios & Kargas (2017)",Green roofs are emerging as one of the most pr...,0.76736,0.038839,0.018831,0.152413,774,323,36.008448,213,...,0.0,0.0,0.007454,0.016281,0.033346,0.014123,0.006277,0.019419,0.023735,0.021381
3,"Morakinyo, Dahanayake, Ng, Lun Chow (2017)","As a consequence of rapid urbanization, many g...",0.812384,0.010906,0.008503,0.145841,787,375,43.326039,266,...,0.0,0.0,0.003882,0.018854,0.041959,0.019593,0.006839,0.025323,0.022736,0.004991
4,"Herrera-Gomez, Quevedo-Nolasco, Perez-Urrestar...",Numerous studies on climate change predict a g...,0.800773,0.009198,0.010302,0.16262,881,379,39.27522,261,...,0.0,0.0,0.004047,0.018948,0.038447,0.022995,0.006255,0.022995,0.022995,0.00791
5,"Tam, Wang, Le (2016 )","In Hong Kong, high pollution and lack of green...",0.811107,0.010968,0.011316,0.151114,863,411,43.838384,288,...,0.0,0.0,0.005223,0.023155,0.037779,0.017584,0.007834,0.024547,0.028377,0.006442
6,Rasul & Arutla (2020),"Industrialisation, increase in luxury of the p...",0.806493,0.004137,0.012094,0.16168,507,238,43.367347,159,...,0.0,0.0,0.002228,0.019733,0.042011,0.021324,0.007002,0.021961,0.021961,0.007002
7,"Chaudhary, Sandall, Lazarski (2019 )",Land-system conversion through urbanization re...,0.798241,0.020029,0.017831,0.142648,583,317,45.643154,229,...,0.0,0.0,0.009038,0.021739,0.025403,0.013923,0.008061,0.018564,0.030777,0.012702
8,Vijayaraghavan (2016),"As a result of rapid economic growth, many cou...",0.821545,0.003999,0.007248,0.150962,602,290,45.123726,204,...,0.0,0.0,0.007498,0.027993,0.036741,0.015746,0.006998,0.022494,0.035491,0.002
9,Lee & Jim (2018),Compact cities are susceptible to the double ...,0.789193,0.023338,0.025875,0.152461,599,303,46.12069,217,...,0.0,0.0,0.005835,0.022577,0.038052,0.016489,0.006088,0.018772,0.028412,0.014967


In [6]:
df.describe()

Unnamed: 0,alpha-chars-ratio,digit-chars-ratio,upper-chars-ratio,white-chars-ratio,number-of-words,size-of-vocabulary,type-token-ratio,hapax-legomena,hapax-dislegomena,mean-word-length,...,exclamation_mark_rate,question-mark-rate,adverbs-rate,adjectives-rate,nouns-rate,prepositions-rate,conjuctions-rate,verb-rate,adjectivs-adverbs-rate,named-entites-count
count,26.0,26.0,26.0,26.0,26.0,26.0,26.0,26.0,26.0,26.0,...,26.0,26.0,26.0,26.0,26.0,26.0,26.0,26.0,26.0,26.0
mean,0.78845,0.022881,0.015526,0.152269,685.5,320.5,42.367308,224.846154,45.653846,6.410582,...,0.0,0.0,0.005749,0.021353,0.033536,0.017049,0.007892,0.019728,0.027102,0.013122
std,0.020304,0.012491,0.00609,0.00529,224.735444,75.704293,5.148553,50.139559,12.148884,0.217956,...,0.0,0.0,0.002183,0.002997,0.005245,0.002947,0.00143,0.003193,0.003811,0.006929
min,0.754937,0.00307,0.006262,0.142648,315.0,188.0,32.929642,142.0,24.0,5.983278,...,0.0,0.0,0.002228,0.016281,0.024959,0.01095,0.006088,0.013967,0.020091,0.002
25%,0.771914,0.010895,0.010835,0.150059,548.25,279.75,39.353588,198.5,38.25,6.322963,...,0.0,0.0,0.004184,0.018877,0.029404,0.015124,0.006983,0.017461,0.023802,0.006582
50%,0.786976,0.024942,0.015048,0.152545,623.5,296.0,43.186935,210.5,43.5,6.402597,...,0.0,0.0,0.005537,0.021048,0.033633,0.016547,0.007737,0.018989,0.027368,0.013834
75%,0.807295,0.033247,0.020818,0.155259,783.75,373.75,45.240932,264.75,52.5,6.535432,...,0.0,0.0,0.007452,0.02313,0.037983,0.018755,0.008505,0.022389,0.030037,0.019171
max,0.821545,0.040644,0.025875,0.16262,1411.0,534.0,57.062147,333.0,80.0,6.949002,...,0.0,0.0,0.009954,0.027993,0.042011,0.022995,0.012162,0.02569,0.035491,0.022544


In [7]:
df.to_csv('Paper-features.csv')