In [1]:
from pycaret.classification import *
import pandas as pd
import operator
from networkit.graphtools import GraphTools
from itertools import combinations, chain
import networkx as nx
import networkit as nk
import matplotlib.pyplot as plt
import string
from itertools import combinations, chain
import networkx as nx
import networkit as nk
import matplotlib.pyplot as plt
import string
import numpy as np

In [2]:
class GraphAnalyzer:
    def __init__(self):
        pass

    def sentences_to_graph(self, text):
        sentences = text.replace('-', ' ').split('.')
        list_of_words = [ [ word.strip(string.punctuation).lower() for word in sentence.split(' ') if word != '' ] for sentence in sentences ]
        # print(list_of_words)
        links_per_sentence = list([ list(combinations(sentence, 2)) for sentence in list_of_words ])
        links = list(chain(*links_per_sentence))
        nodes = set(chain(*list_of_words)) - set([''])

        G = nx.Graph()
        G.add_nodes_from(nodes)
        G.add_edges_from(links)
        G.remove_edges_from(nx.selfloop_edges(G))
        G_nk = nk.nxadapter.nx2nk(G)
        idmap = dict((u, id) for (id, u) in zip(G.nodes(), range(G.number_of_nodes())))
        return G, G_nk, idmap

    def stats(self, arr):
        return {
            'min': np.min(arr), 
            '1qr': np.percentile(arr, 25), 
            'median': np.median(list(arr)), 
            '3qr': np.percentile(arr, 75), 
            'max': np.max(arr),
            'avg': np.mean(arr)
        }

    def get_density(self, G):
        return GraphTools.density(G)

    def get_degree_centrality(self, G):
        deg = nk.centrality.DegreeCentrality(G)
        deg.run()
        return deg.scores()

    def get_eig_centrality(self, G):
        deg = nk.centrality.EigenvectorCentrality(G)
        deg.run()
        return deg.scores()

    def get_pagerank(self, G):
        deg = nk.centrality.PageRank(G)
        deg.run()
        return deg.scores()

    def get_btw_centrality(self, G):
        deg = nk.centrality.ApproxBetweenness(G)
        deg.run()
        return deg.scores()

    def get_cls_centrality(self, G):
        deg = nk.centrality.ApproxBetweenness(G)
        deg.run()
        return deg.scores()

    def get_size(self, G):
        return GraphTools.size(G)

    def get_max_cliques(self, G):
        mc = nk.clique.MaximalCliques(G)
        mc.run()
        return mc.getCliques()

    def get_isolated_nodes(self, degrees):
        sequence = sorted(degrees)
        i = 0
        nIsolated = 0
        while i < len(sequence) and sequence[i] == 0:
            nIsolated += 1
            i += 1
        return nIsolated

    def get_connected_components(self, G):
        cc = nk.components.ConnectedComponents(G)
        scc = nk.components.StronglyConnectedComponents(G)
        cc.run()
        scc.run()
        return cc.numberOfComponents(), scc.numberOfComponents()
    def prefaced_dict(self, d, prefix):
        return { prefix + str(key): value for key, value in d.items() }
    # Add other graph analysis methods here
    def get_globals(self, G):
        nodes, edges = self.get_size(G)
        cc, scc = self.get_connected_components(G)

        return {
            'nodes': nodes,
            'edges': edges,
            'density': self.get_density(G),
            'isolated_nodes': self.get_isolated_nodes(self.get_degree_centrality(G)),
            'core_number': max(nx.core_number(nk.nxadapter.nk2nx(G)).items(), key = operator.itemgetter(1))[1],
            'global_cc': nk.globals.ClusteringCoefficient.exactGlobal(G),
            'approx_avg_local_cc': nk.globals.clustering(G),
            'max_cliques': len(self.get_max_cliques(G)),
            'connected_components': cc,
            'strongly_connected_components': scc,
            'degree_assortativity_coefficient': nx.degree_assortativity_coefficient(nk.nxadapter.nk2nx(G)),
            'degree_pearson_correlation_coefficient': nx.degree_pearson_correlation_coefficient(nk.nxadapter.nk2nx(G))
        }


    def analyze_graph(self, text):
        G, G_nk, idmap = self.sentences_to_graph(text)
        Gcc = sorted(nx.connected_components(G), key=len, reverse=True)
        G0 = G.subgraph(Gcc[0])
        features = {
            **self.prefaced_dict(self.get_globals(G_nk), ''),
            'avg_shortest_path_length': nx.average_shortest_path_length(G0),
            'avg_neighbour_degree': np.mean(list((nx.average_neighbor_degree(G).values()))),
            'avg_degree_connectivity': np.mean(list((nx.average_degree_connectivity(G).values()))),
            **self.prefaced_dict(self.stats(self.get_degree_centrality(G_nk)), 'degree_centrality_'),
            **self.prefaced_dict(self.stats(self.get_eig_centrality(G_nk)), 'eigenvector_centrality_'),
            **self.prefaced_dict(self.stats(self.get_pagerank(G_nk)), 'pagerank_'),
            **self.prefaced_dict(self.stats(self.get_btw_centrality(G_nk)), 'betweenness_centrality_'),
            **self.prefaced_dict(self.stats(self.get_cls_centrality(G_nk)), 'closeness_centrality_'),
            'check_planarity': nx.check_planarity(G)[0],
        }

        return pd.DataFrame([features])


In [28]:
class TextClassifier:
    def __init__(self, model_path_2000):
        # self.model_200 = load_model('../models/lightgbm')
        self.model_2000 = load_model('../models/lightgbm_2000')
        self.graph_analyzer = GraphAnalyzer()

    def predict_label(self, text):
        df_features = self.graph_analyzer.analyze_graph(text)
        # prediction_200 = list(predict_model(self.model_200,data=df_features).iloc[:,-2])[0]
        prediction_2000 =list(predict_model(self.model_2000,data=df_features).iloc[:,-2])[0]

        return  prediction_2000



In [4]:
# Constants

TAACO_DIR = '../taaco/'
INPUT_DIR = '../taaco/inputs/'
OUTPUT_DIR = '../taaco/outputs/'


dfs = []
tags = ['init', 'generation']
filenames = ['../data/ieee-init.jsonl', '../data/ieee-chatgpt-generation.jsonl']

for filename in filenames:
  dfs.append(pd.read_json(filename, lines=True))

# Human-written
# print(f"Title: { dfs[0]['title'][1] } Keywords: {dfs[0]['keyword'][1]}")
  

In [9]:
len(dfs[0])

15395

In [29]:
text_classifier = TextClassifier('../models/lightgbm_2000')

Transformation Pipeline and Model Successfully Loaded


In [15]:
# model = create_model('lightgbm')
# # Save the model
# save_model(model, '../models/lightgbm_2000')
# model = load_model('../models/lightgbm_2000')

In [None]:
# Assuming you want to classify a specific text
row = dfs[0][dfs[0]['id'] == 8619053]
correct_prediction_2000 = 0
for i in range(1001,2000):
    print(i, correct_prediction_2000)
    text_to_classify = dfs[0]['abstract'][i]
    #print(text_to_classify)
    prediction_2000 = text_classifier.predict_label(text_to_classify)
    if prediction_2000 == 'human':
        correct_prediction_2000 += 1   
correct_prediction_2000
    # print(f"Prediction for 200: {prediction_200}")
    # print(f"Prediction for 2000: {prediction_2000}")

In [22]:
print(correct_prediction_200,correct_prediction_2000)
print(sum([1 for i in range(1001,2000)]))

821 821
999


In [None]:
correct_prediction_2000 = 0
for i in range(2001,3000):
    print(i, correct_prediction_2000)
    text_to_classify = dfs[1]['abstract'][i]
    #print(text_to_classify)
    prediction_2000 = text_classifier.predict_label(text_to_classify)
    if prediction_2000 == 'ai':
        correct_prediction_2000 += 1   
correct_prediction_2000

In [33]:
print(sum([1 for i in range(2001,3000)]))

999


In [55]:
text_to_classify = "Knowledge-intensive crowdsourcing harnesses the collective intelligence of widely distributed participants to address intricate problems demanding diverse expertise. This research delves into the factors shaping participant estimation within knowledge-intensive crowdsourcing environments, with a keen focus on identifying pivotal characteristics that profoundly influence the accuracy and reliability of estimations.Employing empirical analysis and data-driven methodologies, this study scrutinizes the interplay between participants' backgrounds, skills, and prior experiences, and their impact on estimation accuracy. The research systematically explores the influence of domain expertise, cognitive abilities, and collaborative tendencies on participants' estimation performance. Additionally, socio-demographic factors are considered to comprehensively grasp the broader contextual influences on estimation outcomes.The findings from this study significantly contribute to a nuanced comprehension of the human factors that underpin estimation accuracy in knowledge-intensive crowdsourcing. These insights have practical implications, informing the development of adept participant selection mechanisms, tailored training programs, and collaborative frameworks designed to elevate the overall success of crowdsourced projects within domains necessitating heightened levels of expertise.This research serves as a crucial bridge between knowledge-intensive crowdsourcing, human factors, and estimation accuracy, offering valuable insights to researchers, practitioners, and platform designers. The aim is to empower them in optimizing the performance of crowdsourced solutions within intricate knowledge domains."

In [56]:
prediction_2000 = text_classifier.predict_label(text_to_classify)

In [57]:
prediction_2000

'human'

In [62]:
text_to_classify =dfs[1]['abstract'][5000]
prediction_2000 = text_classifier.predict_label(text_to_classify)
prediction_2000

'ai'

In [63]:
dfs[1]['title'][5000]

'Sound Source Separation Using Spatio-temporal Sound Pressure Distribution Images and Machine Learning'