In [3]:
# conda install openjdk
# conda install pyspark
# conda install -c conda-forge findspark

In [1]:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import StandardScaler, Word2Vec, StopWordsRemover, VectorAssembler, StringIndexer, HashingTF, IDF, Tokenizer
import pyspark.sql.functions as f
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder



# Build (or reuse) the Spark session; master "local[*]" runs Spark locally.
spark = (
    SparkSession.builder
    .appName("HTFIDF")
    .master("local[*]")
    .config("spark.executor.memory", "70g")
    .config("spark.driver.memory", "50g")
    .config("spark.memory.offHeap.enabled", True)
    .config("spark.memory.offHeap.size", "16g")
    .getOrCreate()
)

sc = spark.sparkContext

# Load the training CSV: the first row is the header, the double quote acts as
# both quote and escape character, and multiline parsing is switched on.
df = (
    spark.read
    .option("header", True)
    .option("quote", "\"")
    .option("escape", "\"")
    .option("multiline", True)
    .csv("data/train.csv")
)

df.printSchema()



root
 |-- id: string (nullable = true)
 |-- comment_text: string (nullable = true)
 |-- toxic: string (nullable = true)
 |-- severe_toxic: string (nullable = true)
 |-- obscene: string (nullable = true)
 |-- threat: string (nullable = true)
 |-- insult: string (nullable = true)
 |-- identity_hate: string (nullable = true)



In [None]:
# Lower-case the comment text, then tokenize it and remove stop words.
tokenizer = Tokenizer(inputCol='comment_text', outputCol='words')
remover = StopWordsRemover(inputCol='words', outputCol='filtered')

new_df = remover.transform(
    tokenizer.transform(
        df.withColumn('comment_text', f.lower(f.col('comment_text')))
    )
)


# Pipeline: hashed term frequencies -> IDF weighting -> label indexing
# -> logistic regression on the "toxic" column.
NUM_FEATURES = 10

lr = LogisticRegression(maxIter=10)
label_stringIdx = StringIndexer(inputCol="toxic", outputCol="label")
hashing = HashingTF(
    inputCol='filtered',
    outputCol='raw_features',
    numFeatures=NUM_FEATURES,
)
idf = IDF(inputCol='raw_features', outputCol='features')

pipeline = Pipeline(stages=[hashing, idf, label_stringIdx, lr])
model = pipeline.fit(new_df)

# Apply the fitted pipeline to the same rows it was trained on and show the
# indexed label next to the model's prediction.
rescale_df = model.transform(new_df)
rescale_df.select('label', 'prediction').show(truncate=False)

In [None]:
# Grid-search the HashingTF vector size with cross-validation, scoring every
# candidate with BinaryClassificationEvaluator.
param_grid = (
    ParamGridBuilder()
    .addGrid(hashing.numFeatures, [10, 20, 50, 100, 200, 500, 1000])
    .build()
)

cv = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=param_grid,
    evaluator=BinaryClassificationEvaluator(),
    seed=42,
)

model = cv.fit(new_df)

# Flatten each evaluated param map into a plain {param name: value} dict.
params = [
    {param.name: value for param, value in param_map.items()}
    for param_map in model.getEstimatorParamMaps()
]

# One row per grid point: the averaged CV metric next to the params behind it.
metric_name = model.getEvaluator().getMetricName()
pd.DataFrame.from_dict([
    {metric_name: score, **grid_point}
    for grid_point, score in zip(params, model.avgMetrics)
])

## TF-IDF

In [None]:
def tf_idf_cv(target_name, df = df, num_feature_list = [10, 20, 50, 100, 200, 500, 1000]):
    """Cross-validate a TF-IDF + logistic-regression pipeline for one target.

    The comment text is lower-cased, tokenized and stripped of stop words.
    A HashingTF -> IDF -> StringIndexer -> LogisticRegression pipeline is then
    cross-validated once per candidate ``numFeatures`` value.

    Args:
        target_name: Name of the column in ``df`` to use as the label.
        df: Spark DataFrame with a ``comment_text`` column and the target
            column. The default is the module-level ``df`` as it existed when
            this function was defined.
        num_feature_list: Candidate values for ``HashingTF.numFeatures``.

    Returns:
        pandas.DataFrame with one row per candidate value: a column named
        after the evaluator's metric holding the averaged cross-validation
        score, followed by a column for the tuned parameter.
    """
    # Text preparation: lower-case -> tokens -> tokens without stop words.
    lowered = df.withColumn('comment_text', f.lower(f.col('comment_text')))
    tokenized = Tokenizer(inputCol='comment_text', outputCol='words').transform(lowered)
    prepared = StopWordsRemover(inputCol='words', outputCol='filtered').transform(tokenized)

    # numFeatures=10 is only the initial value; the grid below overrides it.
    term_freq = HashingTF(inputCol='filtered', outputCol='raw_features', numFeatures=10)
    stages = [
        term_freq,
        IDF(inputCol='raw_features', outputCol='features'),
        StringIndexer(inputCol=target_name, outputCol="label"),
        LogisticRegression(maxIter=10),
    ]
    grid = ParamGridBuilder().addGrid(term_freq.numFeatures, num_feature_list).build()

    validator = CrossValidator(
        estimator=Pipeline(stages=stages),
        estimatorParamMaps=grid,
        evaluator=BinaryClassificationEvaluator(),
        seed=42,
    )
    fitted = validator.fit(prepared)

    # Pair every evaluated param map with its averaged metric, metric column first.
    metric_name = fitted.getEvaluator().getMetricName()
    rows = []
    for param_map, score in zip(fitted.getEstimatorParamMaps(), fitted.avgMetrics):
        row = {metric_name: score}
        row.update({param.name: value for param, value in param_map.items()})
        rows.append(row)

    return pd.DataFrame.from_dict(rows)


In [None]:
def _run_tf_idf(target):
    """Run tf_idf_cv for `target`, print a completion line, return the metrics table."""
    scores = tf_idf_cv(target)
    print(f'{target} done')
    return scores


# NOTE(review): the schema also lists `severe_toxic`, which is not evaluated
# here -- confirm that skipping it is intentional.
toxic = _run_tf_idf('toxic')

obscene = _run_tf_idf('obscene')

threat = _run_tf_idf('threat')

insult = _run_tf_idf('insult')

identity = _run_tf_idf('identity_hate')


In [None]:
# Tag each per-target metrics table (in place) with the target it belongs to.
for _target_label, _target_scores in (
    ("toxic", toxic),
    ("obscene", obscene),
    ("threat", threat),
    ("insult", insult),
    ("identity_hate", identity),
):
    _target_scores['target'] = _target_label


# One row per numFeatures value, one column per target, cells hold areaUnderROC.
(
    pd.concat([toxic, obscene, threat, insult, identity])
    .pivot_table(values='areaUnderROC', index='numFeatures', columns=['target'])
    .reset_index()
)

## Word2Vec


In [None]:
def w2v_cv(target_name, df = df, VECTORSIZE = 3):
    """Cross-validate a Word2Vec + logistic-regression pipeline for one target.

    The comment text is lower-cased, tokenized and stripped of stop words.
    A Word2Vec -> StringIndexer -> LogisticRegression pipeline is then
    cross-validated with a single-point grid on ``Word2Vec.vectorSize``.

    Args:
        target_name: Name of the column in ``df`` to use as the label.
        df: Spark DataFrame with a ``comment_text`` column and the target
            column. The default is the module-level ``df`` as it existed when
            this function was defined.
        VECTORSIZE: Embedding dimensionality passed to Word2Vec; also the only
            value placed in the parameter grid.

    Returns:
        pandas.DataFrame with one row per grid point (a single row here): a
        column named after the evaluator's metric holding the averaged
        cross-validation score, followed by a ``vectorSize`` column.
    """
    # Text preparation: lower-case -> tokens -> tokens without stop words.
    lowered = df.withColumn('comment_text', f.lower(f.col('comment_text')))
    tokenized = Tokenizer(inputCol='comment_text', outputCol='words').transform(lowered)
    prepared = StopWordsRemover(inputCol='words', outputCol='filtered').transform(tokenized)

    embedder = Word2Vec(vectorSize=VECTORSIZE, minCount=0, inputCol='filtered', outputCol='features')
    stages = [
        embedder,
        StringIndexer(inputCol=target_name, outputCol="label"),
        LogisticRegression(maxIter=10),
    ]
    grid = ParamGridBuilder().addGrid(embedder.vectorSize, [VECTORSIZE]).build()

    validator = CrossValidator(
        estimator=Pipeline(stages=stages),
        estimatorParamMaps=grid,
        evaluator=BinaryClassificationEvaluator(),
        seed=42,
    )
    fitted = validator.fit(prepared)

    # Pair every evaluated param map with its averaged metric, metric column first.
    metric_name = fitted.getEvaluator().getMetricName()
    rows = []
    for param_map, score in zip(fitted.getEstimatorParamMaps(), fitted.avgMetrics):
        row = {metric_name: score}
        row.update({param.name: value for param, value in param_map.items()})
        rows.append(row)

    return pd.DataFrame.from_dict(rows)

In [None]:
def _run_w2v(target):
    """Run w2v_cv for `target`, print a completion line, return the metrics table."""
    scores = w2v_cv(target)
    print(f'{target}_w2v done')
    return scores


# NOTE(review): the schema also lists `severe_toxic`, which is not evaluated
# here -- confirm that skipping it is intentional.
toxic_w2v = _run_w2v('toxic')

obscene_w2v = _run_w2v('obscene')

threat_w2v = _run_w2v('threat')

insult_w2v = _run_w2v('insult')

identity_hate_w2v = _run_w2v('identity_hate')


In [None]:
# Tag each Word2Vec metrics table (in place) with the target it belongs to.
toxic_w2v['target'] = "toxic"
obscene_w2v['target'] = "obscene"
threat_w2v['target'] = "threat"
insult_w2v['target'] = "insult"
identity_hate_w2v['target'] = "identity_hate"

# w2v_cv tunes Word2Vec.vectorSize, so its result tables carry a 'vectorSize'
# column; there is no 'numFeatures' column to pivot on (using it raises KeyError).
# Result: one row per vectorSize, one column per target, cells hold areaUnderROC.
pd.pivot_table(pd.concat([toxic_w2v, obscene_w2v, threat_w2v, insult_w2v, identity_hate_w2v]),
               values='areaUnderROC', index='vectorSize', columns=['target']).reset_index()