In [None]:
import os
import re
import nltk
import json
import matplotlib.pyplot as plt
import collections, functools, operator
from wordcloud import WordCloud
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer

In [103]:
#TEXT CORPUS CREATION
COMM_DIRECTORY = '/kaggle/input/licenses/Comm'
NONCOMM_DIRECTORY = '/kaggle/input/licenses/NonC'

stop_words = set(stopwords.words('english'))

def tokkenizer(directory):
    text_corpus = ''
    for file in os.listdir(directory):
        with open(os.path.join(directory, file)) as json_file:
            json_corpus = json.load(json_file)
            text_corpus += json_corpus['licenseText']
            text_corpus = re.sub(r"[\n,\-\=()%\\/]", ' ', text_corpus)
            text_corpus = re.sub(r" +", ' ', text_corpus)

    token_text = word_tokenize(text_corpus)      
    token_text_stop = [w for w in comm_tokens if not w.lower() in stop_words]
    return token_text, token_text_stop, text_corpus

comm_tokens, comm_tokens_stop, comm_corpus = tokkenizer(COMM_DIRECTORY)
noncomm_tokens, noncomm_tokens_stop, noncomm_corpus = tokkenizer(NONCOMM_DIRECTORY)

In [104]:
#SENTIMENT ANALYSIS
def sentiment_analizer(corpus, text):
    sia = SentimentIntensityAnalyzer()
    comercial_corpus_sent = corpus.split('.')
    scores = list(map(lambda x: sia.polarity_scores(x), comercial_corpus_sent))
    result = dict(functools.reduce(operator.add, map(collections.Counter, scores)))
    result = {key: value / len(scores) for key, value in result.items()}
    print(text)
    print(result)

sentiment_analizer(comercial_corpus, 'COMMERCIONAL')
sentiment_analizer(noncomercial_corpus, 'NONCOMMERCIONAL')

COMMERCIONAL
{'neu': 0.7376922911514888, 'pos': 0.06345812278249664, 'compound': 0.13032507257284062, 'neg': 0.021120309644124344}
NONCOMMERCIONAL
{'neu': 0.7052508709067419, 'pos': 0.05898312929232629, 'compound': 0.11334192793868764, 'neg': 0.022417487807305576}


In [None]:
def create_word_cloud(text):
    comm_wordcloud = WordCloud(width = 800, height = 800,
                    background_color ='white', stopwords={''},
                    min_font_size = 10).generate(' '.join(text))

    # plot the WordCloud image                      
    plt.figure(figsize = (8, 8), facecolor = None)
    plt.imshow(comm_wordcloud)
    plt.axis("off")
    plt.tight_layout(pad = 0)

    plt.show()

create_word_cloud(comm_tokens)
create_word_cloud(noncomm_tokens)

In [None]:
stop_words_modi = {'the', ',', 'of', '.', '-', 'to', 'this', 'in', 'that', 'a', '(', ')'}

In [149]:
comm_most_common = nltk.FreqDist(w.lower() for w in comm_tokens)
noncomm_most_common = nltk.FreqDist(w.lower() for w in noncomm_tokens)
comm_top = comm_most_common.most_common(100)
noncomm_top = noncomm_most_common.most_common(100)

comm_only_words = list(map(lambda x: re.sub("[0-9(),' \"]",'' ,str(x)), comm_top))
noncomm_only_words = list(map(lambda x: re.sub("[0-9(),' \"]",'' ,str(x)), noncomm_top))

diff = list(set(comm_only_words) - set(noncomm_only_words))
diff_comparision = [[w, comm_most_common[w], noncomm_most_common[w]] for w in diff]

print(diff)
print()
print(diff_comparision)

['sections', 'following', 'limitation', 'library', 'patent', 'copies', 's', 'do']

[['sections', 291, 556], ['following', 279, 541], ['limitation', 286, 554], ['library', 369, 339], ['patent', 469, 506], ['copies', 322, 575], ['s', 138, 331], ['do', 322, 537]]
