In [34]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import when
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, StopWordsRemover

In [35]:
# Start a spark session
spark = SparkSession.builder.appName('Paper_qty1').getOrCreate()
# dataframe_1 = spark.read.format("csv").option("header", "true").load("group_1.csv")

In [36]:
# Read in csv
dataframe_1 = spark.read.format("csv").option("header", "true").load("group_1.csv")
dataframe_1=dataframe_1.withColumn("source_label", when(dataframe_1["label"]!="0","0" ).otherwise(dataframe_1["label"]))

dataframe_2 = spark.read.format("csv").option("header", "true").load("group_2.csv")
dataframe_2=dataframe_2.withColumn("source_label", when(dataframe_2["label"]!="1","1" ).otherwise(dataframe_2["label"]))


dataframe_3 = spark.read.format("csv").option("header", "true").load("group_3.csv")
dataframe_3=dataframe_3.withColumn("source_label", when(dataframe_3["label"]!="2","2" ).otherwise(dataframe_3["label"]))


# #combine or append the dataframes
df=dataframe_1.union(dataframe_2).union(dataframe_3)
print("Number of records: " + str(df.count()))
df.show()

Number of records: 45028
+---+--------+--------------------+--------------------+-------+-----+------------+
|_c0|    pmid|               title|            abstract|journal|label|source_label|
+---+--------+--------------------+--------------------+-------+-----+------------+
| 32|29590094|Quantized Majoran...|Majorana zero-mod...| nature|    0|           0|
| 33|29590093|The logic of sing...|Neocortical areas...| nature|    0|           0|
| 34|29590092|Itaconate is an a...|The endogenous me...| nature|    0|           0|
| 35|29590091|A new class of sy...|A challenge in th...| nature|    0|           0|
| 36|29590090|Architecture of t...|Nutrients, such a...| nature|    0|           0|
| 37|29590089|Whole-organism cl...|Embryonic develop...| nature|    0|           0|
| 38|29590088|Structure of the ...|The shape, elonga...| nature|    0|           0|
| 39|29579743|Room-temperature ...|Room-temperature ...| nature|    0|           0|
| 40|29562235|Shifts in tree fu...|Forests have a k

## Data Cleaning

In [7]:
#truncated abstract and appended to label 
df.filter(df['pmid']=='28953878').select("abstract","label","source_label").show(truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------+------------+
|abstract                                                                                                                                                                                                                                                                                                                                                                          

In [8]:
#records with Wrong labels summary
df.groupBy("label").count().show(5)

+--------------------+-----+
|               label|count|
+--------------------+-----+
| and response to ...|    1|
| the differences ...|    1|
|                SOX2|    1|
|            habitual|    1|
|     area patterning|    1|
+--------------------+-----+
only showing top 5 rows



In [9]:
df.groupBy("source_label").count().show()

+------------+-----+
|source_label|count|
+------------+-----+
|           0|13611|
|           1|14597|
|           2|16820|
+------------+-----+



In [10]:
#wrong labels
print("Total # of Wrongly labeled features: " + str(df.select('label').distinct().count()-2))
df.select('abstract','label').distinct().filter(df.label.isin(["0","1","2"])==False).show(3,truncate=False)

Total # of Wrongly labeled features: 1652
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [11]:
df.select('abstract','label','source_label').distinct().filter(df.label.isin(["0","1","2"])==False).show(3)

+--------------------+--------------------+------------+
|            abstract|               label|source_label|
+--------------------+--------------------+------------+
|"The global geody...| which implies th...|           0|
|"When deformed be...| as well as two m...|           0|
|"The synaptic mec...| and suggest that...|           1|
+--------------------+--------------------+------------+
only showing top 3 rows



In [12]:
df.columns

['_c0', 'pmid', 'title', 'abstract', 'journal', 'label', 'source_label']

In [13]:
# Create temporary view and run raw query against the view
df.createTempView("Publ") 
#Concatenate the abstract and the it truncated text in label field

df2=spark.sql("SELECT pmid,journal,title,concat(abstract,label) as abstract, source_label as label FROM Publ")
df2.show()


+--------+-------+--------------------+--------------------+-----+
|    pmid|journal|               title|            abstract|label|
+--------+-------+--------------------+--------------------+-----+
|29590094| nature|Quantized Majoran...|Majorana zero-mod...|    0|
|29590093| nature|The logic of sing...|Neocortical areas...|    0|
|29590092| nature|Itaconate is an a...|The endogenous me...|    0|
|29590091| nature|A new class of sy...|A challenge in th...|    0|
|29590090| nature|Architecture of t...|Nutrients, such a...|    0|
|29590089| nature|Whole-organism cl...|Embryonic develop...|    0|
|29590088| nature|Structure of the ...|The shape, elonga...|    0|
|29579743| nature|Room-temperature ...|Room-temperature ...|    0|
|29562235| nature|Shifts in tree fu...|Forests have a ke...|    0|
|29562233| nature|Structural insigh...|The organellar tw...|    0|
|29512654| nature|Correlated insula...|A van der Waals h...|    0|
|29512653| nature|Structure of the ...|The insulin recep...|  

In [14]:
## check the summary (its only three categories of labels)
df2.groupBy('label').count().show()

+-----+-----+
|label|count|
+-----+-----+
|    0|13611|
|    1|14597|
|    2|16820|
+-----+-----+



# TPOT Model

### Split data into training and test dataset

In [88]:
import pandas as pd
import numpy as np
from tpot import TPOTClassifier
from sklearn.model_selection import train_test_split

In [89]:
#converting spark dataframe to pandas dataframe
df3=df2.toPandas()
df3.head()

Unnamed: 0,pmid,journal,title,abstract,label
0,29590094,nature,Quantized Majorana conductance.,Majorana zero-modes-a type of localized quasip...,0
1,29590093,nature,The logic of single-cell projections from visu...,Neocortical areas communicate through extensiv...,0
2,29590092,nature,Itaconate is an anti-inflammatory metabolite t...,The endogenous metabolite itaconate has recent...,0
3,29590091,nature,A new class of synthetic retinoid antibiotics ...,A challenge in the treatment of Staphylococcus...,0
4,29590090,nature,Architecture of the human GATOR1 and GATOR1-Ra...,"Nutrients, such as amino acids and glucose, si...",0


In [133]:
type(df3)

pandas.core.frame.DataFrame

In [135]:
df3=df3[df3['label']!='2']
df3.dropna(inplace=True)

X = df3["abstract"]#.values.reshape(-1, 1)
y = df3["label"] #.values.reshape(-1, 1)
print(X.shape, y.shape)

# data=np.array(df2.abstract)


(28183,) (28183,)


In [136]:
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    train_size=0.75, test_size=0.25)


In [138]:
# transform X and y to lists for processing
X_train = X_train.tolist()
X_test = X_test.tolist()
y_train = y_train.tolist()
y_test = y_test.tolist()

In [139]:
# Tokenize dataframe
tokened = Tokenizer(inputCol="abstract", outputCol="words")
tokened_transformed = tokened.transform(X)
tokened_transformed.count()

AttributeError: 'Series' object has no attribute '_jdf'

In [137]:
tpot = TPOTClassifier(generations=2, population_size=5, verbosity=2)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))
# tpot.export('tpot_mnist_pipeline.py')

TypeError: ufunc 'isnan' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''

In [5]:
# Remove stop words
stop_list = ["@VirginAmerica", "$30", "@virginamerica"]
remover = StopWordsRemover(inputCol="words", outputCol="filtered", stopWords=stop_list)
removed_frame = remover.transform(tokened_transformed)
removed_frame.show(truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------+
|Airline Tweets                                                                                                                         |words                                                                                                                                                          |filtered                                                                                                                                  |
+-----------------------------------------------------------------------------------------------------------------------------

In [6]:
# Run the hashing term frequency
hashing = HashingTF(inputCol="filtered", outputCol="hashedValues", numFeatures=pow(2,4))

# Transform into a DF
hashed_df = hashing.transform(removed_frame)
hashed_df.show()

+--------------------+--------------------+--------------------+--------------------+
|      Airline Tweets|               words|            filtered|        hashedValues|
+--------------------+--------------------+--------------------+--------------------+
|@VirginAmerica pl...|[@virginamerica, ...|[plus, you've, ad...|(16,[3,4,5,7,8,9,...|
|@VirginAmerica se...|[@virginamerica, ...|[seriously, would...|(16,[0,1,2,3,4,9,...|
|@VirginAmerica do...|[@virginamerica, ...|[do, you, miss, m...|(16,[0,1,8,10,11,...|
|@VirginAmerica Ar...|[@virginamerica, ...|[are, the, hours,...|(16,[0,1,2,4,7,9,...|
|@VirginAmerica aw...|[@virginamerica, ...|[awaiting, my, re...|(16,[0,3,4,6,7,8,...|
+--------------------+--------------------+--------------------+--------------------+



In [7]:
# Fit the IDF on the data set 
idf = IDF(inputCol="hashedValues", outputCol="features")
idfModel = idf.fit(hashed_df)
rescaledData = idfModel.transform(hashed_df)

In [8]:
# Display the dataframe
rescaledData.select("words", "features").show(truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|words                                                                                                                                                          |features                                                                                                                                                                                                             |
+---------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------

In [210]:
# Stop Spark
spark.stop()

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:63101)
Traceback (most recent call last):
  File "/Users/Shemelis/anaconda/lib/python3.6/site-packages/py4j/java_gateway.py", line 852, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/Shemelis/anaconda/lib/python3.6/site-packages/py4j/java_gateway.py", line 990, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 61] Connection refused


# Naive Bayes model

### Feature Transformations Using pyspark MlLib

In [301]:
df2.show(5)

+--------+-------+--------------------+--------------------+-----+
|    pmid|journal|               title|            abstract|label|
+--------+-------+--------------------+--------------------+-----+
|29590094| nature|Quantized Majoran...|Majorana zero-mod...|    0|
|29590093| nature|The logic of sing...|Neocortical areas...|    0|
|29590092| nature|Itaconate is an a...|The endogenous me...|    0|
|29590091| nature|A new class of sy...|A challenge in th...|    0|
|29590090| nature|Architecture of t...|Nutrients, such a...|    0|
+--------+-------+--------------------+--------------------+-----+
only showing top 5 rows



In [302]:
# Create a length column to be used as a future feature 
from pyspark.sql.functions import length
df2 = df2.withColumn('length', length(df2['abstract']))
# df2.show()

from pyspark.sql.types import IntegerType
df2=df2.withColumn("label", df2["label"].cast(IntegerType()))

In [303]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, StringIndexer

# Create all the features to the data set
abstract_to_num = StringIndexer(inputCol='abstract',outputCol='abst')
tokenizer = Tokenizer(inputCol="abstract", outputCol="token_text")
stopremove = StopWordsRemover(inputCol='token_text',outputCol='stop_tokens')
hashingTF = HashingTF(inputCol="stop_tokens", outputCol='hash_token')
idf = IDF(inputCol='hash_token', outputCol='idf_token')

In [304]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vector

# Create feature vectors
clean_up = VectorAssembler(inputCols=['idf_token', 'length'], outputCol='features')

In [314]:
# Create a and run a data processing Pipeline
from pyspark.ml import Pipeline
data_prep_pipeline = Pipeline(stages=[abstract_to_num, tokenizer, stopremove, hashingTF, idf, clean_up])

In [315]:
# Fit and transform the pipeline
cleaner = data_prep_pipeline.fit(df2)
cleaned = cleaner.transform(df2)

Py4JJavaError: An error occurred while calling o3597.fit.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 3 in stage 874.0 failed 1 times, most recent failure: Lost task 3.0 in stage 874.0 (TID 14410, localhost, executor driver): TaskResultLost (result lost from block manager)
Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1599)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1587)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1586)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1586)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:831)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1820)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1769)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1758)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:642)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2027)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2048)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2067)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2092)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:939)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:938)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$countByKey$1.apply(PairRDDFunctions.scala:370)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$countByKey$1.apply(PairRDDFunctions.scala:370)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
	at org.apache.spark.rdd.PairRDDFunctions.countByKey(PairRDDFunctions.scala:369)
	at org.apache.spark.rdd.RDD$$anonfun$countByValue$1.apply(RDD.scala:1208)
	at org.apache.spark.rdd.RDD$$anonfun$countByValue$1.apply(RDD.scala:1208)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
	at org.apache.spark.rdd.RDD.countByValue(RDD.scala:1207)
	at org.apache.spark.ml.feature.StringIndexer.fit(StringIndexer.scala:140)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:748)


In [None]:
# Show label of group of kournal and resulting features
cleaned.select(['label', 'features']).show(truncate=False)

In [310]:
# Break data down into a training set and a testing set
(training, testing) = cleaned.randomSplit([0.7, 0.3])

NameError: name 'cleaned' is not defined

In [309]:
from pyspark.ml.classification import NaiveBayes

# Create a Naive Bayes model and fit training data
nb = NaiveBayes(smoothing=1.0, modelType='multinomial')
spam_predictor = nb.fit(training)

NameError: name 'training' is not defined

In [None]:
# Use the Class Evaluator for a cleaner description
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

acc_eval = MulticlassClassificationEvaluator()
acc = acc_eval.evaluate(test_results)
print(f"Accuracy of model at predicting spam was: {acc}")

# Gensim Model

In [226]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2011 Radim Rehurek <radimrehurek@seznam.cz>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

"""Scikit learn interface for `gensim.models.phrases.Phrases`.

Follows scikit-learn API conventions to facilitate using gensim along with scikit-learn.

Examples
--------
>>> from gensim.sklearn_api.phrases import PhrasesTransformer
>>>
>>> # Create the model. Make sure no term is ignored and combinations seen 3+ times are captured.
>>> m = PhrasesTransformer(min_count=1, threshold=3)
>>> texts = [
...   ['I', 'love', 'computer', 'science'],
...   ['computer', 'science', 'is', 'my', 'passion'],
...   ['I', 'studied', 'computer', 'science']
... ]
>>>
>>> # Use sklearn fit_transform to see the transformation.
>>> # Since computer and science were seen together 3+ times they are considered a phrase.
>>> assert ['I', 'love', 'computer_science'] == m.fit_transform(texts)[0]

"""
from six import string_types
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.exceptions import NotFittedError

from gensim import models


class PhrasesTransformer(TransformerMixin, BaseEstimator):
    """Base Phrases module, wraps :class:`~gensim.models.phrases.Phrases`.

    For more information, please have a look to `Mikolov, et. al: "Efficient Estimation of Word Representations in
    Vector Space" <https://arxiv.org/pdf/1301.3781.pdf>`_ and `Gerlof Bouma: "Normalized (Pointwise) Mutual Information
    in Collocation Extraction" <https://svn.spraakdata.gu.se/repos/gerlof/pub/www/Docs/npmi-pfd.pdf>`_.

    """
    def __init__(self, min_count=5, threshold=10.0, max_vocab_size=40000000,
                 delimiter=b'_', progress_per=10000, scoring='default'):
        """

        Parameters
        ----------
        min_count : int, optional
            Terms with a count lower than this will be ignored
        threshold : float, optional
            Only phrases scoring above this will be accepted, see `scoring` below.
        max_vocab_size : int, optional
            Maximum size of the vocabulary. Used to control pruning of less common words, to keep memory under control.
            The default of 40M needs about 3.6GB of RAM.
        delimiter : str, optional
            Character used to join collocation tokens, should be a byte string (e.g. b'_').
        progress_per : int, optional
            Training will report to the logger every that many phrases are learned.
        scoring : str or function, optional
            Specifies how potential phrases are scored for comparison to the `threshold`
            setting. `scoring` can be set with either a string that refers to a built-in scoring function,
            or with a function with the expected parameter names. Two built-in scoring functions are available
            by setting `scoring` to a string:

                * 'default': Explained in `Mikolov, et. al: "Efficient Estimation of Word Representations
                  in Vector Space" <https://arxiv.org/pdf/1301.3781.pdf>`_.
                * 'npmi': Explained in `Gerlof Bouma: "Normalized (Pointwise) Mutual Information in Collocation
                  Extraction" <https://svn.spraakdata.gu.se/repos/gerlof/pub/www/Docs/npmi-pfd.pdf>`_.

            'npmi' is more robust when dealing with common words that form part of common bigrams, and
            ranges from -1 to 1, but is slower to calculate than the default.

            To use a custom scoring function, create a function with the following parameters and set the `scoring`
            parameter to the custom function, see :func:`~gensim.models.phrases.original_scorer` as example.
            You must define all the parameters (but can use only part of it):

                * worda_count: number of occurrences in `sentences` of the first token in the phrase being scored
                * wordb_count: number of occurrences in `sentences` of the second token in the phrase being scored
                * bigram_count: number of occurrences in `sentences` of the phrase being scored
                * len_vocab: the number of unique tokens in `sentences`
                * min_count: the `min_count` setting of the Phrases class
                * corpus_word_count: the total number of (non-unique) tokens in `sentences`

            A scoring function without any of these parameters (even if the parameters are not used) will
            raise a ValueError on initialization of the Phrases class. The scoring function must be pickleable.

        """
        self.gensim_model = None
        self.min_count = min_count
        self.threshold = threshold
        self.max_vocab_size = max_vocab_size
        self.delimiter = delimiter
        self.progress_per = progress_per
        self.scoring = scoring

    def fit(self, X, y=None):
        """Fit the model according to the given training data.

        Parameters
        ----------
        X : iterable of list of str
            Sequence of sentences to be used for training the model.

        Returns
        -------
        :class:`~gensim.sklearn_api.phrases.PhrasesTransformer`
            The trained model.

        """
        self.gensim_model = models.Phrases(
            sentences=X, min_count=self.min_count, threshold=self.threshold,
            max_vocab_size=self.max_vocab_size, delimiter=self.delimiter,
            progress_per=self.progress_per, scoring=self.scoring
        )
        return self

    def transform(self, docs):
        """Transform the input documents into phrase tokens.

        Words in the sentence will be joined by `self.delimiter`.

        Parameters
        ----------
        docs : {iterable of list of str, list of str}
            Sequence of documents to be used transformed.

        Returns
        -------
        iterable of str
            Phrase representation for each of the input sentences.

        """
        if self.gensim_model is None:
            raise NotFittedError(
                "This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method."
            )

        # input as python lists
        if isinstance(docs[0], string_types):
            docs = [docs]
        return [self.gensim_model[doc] for doc in docs]

    def partial_fit(self, X):
        """Train model over a potentially incomplete set of sentences.

        This method can be used in two ways:
            1. On an unfitted model in which case the model is initialized and trained on `X`.
            2. On an already fitted model in which case the X sentences are **added** to the vocabulary.

        Parameters
        ----------
        X : iterable of list of str
            Sequence of sentences to be used for training the model.

        Returns
        -------
        :class:`~gensim.sklearn_api.phrases.PhrasesTransformer`
            The trained model.

        """
        if self.gensim_model is None:
            self.gensim_model = models.Phrases(
                sentences=X, min_count=self.min_count, threshold=self.threshold,
                max_vocab_size=self.max_vocab_size, delimiter=self.delimiter,
                progress_per=self.progress_per, scoring=self.scoring
            )

        self.gensim_model.add_vocab(X)
        return self

ModuleNotFoundError: No module named 'gensim'

In [228]:
import gensim

ModuleNotFoundError: No module named 'gensim'