In [13]:
import numpy as np
from scipy import spatial
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

Matplotlib is building the font cache; this may take a moment.


In [14]:
# Build a {token: vector} lookup from the GloVe text file. On every line the
# first whitespace-separated field is the token and the remaining fields are
# the vector components, stored as a float32 array.
embeddings_dict = {}
with open("data/glove/glove.6B.100d.txt", 'r', encoding='utf-8') as glove_file:
    for raw_line in glove_file:
        fields = raw_line.split()
        embeddings_dict[fields[0]] = np.asarray(fields[1:], "float32")

In [15]:
def find_closest_embeddings(embedding, cutoff=25):
    """Return the vocabulary tokens nearest to ``embedding``, closest first.

    Distances are Euclidean and are measured against every vector stored in
    the module-level ``embeddings_dict``.

    Args:
        embedding: Query vector; it must have the same length as the stored
            vectors.
        cutoff: Maximum number of tokens to return. Pass ``None`` to get the
            ranking of the whole vocabulary.

    Returns:
        A list of at most ``cutoff`` tokens ordered by increasing distance.
    """
    ranked_tokens = sorted(
        embeddings_dict,
        key=lambda token: spatial.distance.euclidean(embeddings_dict[token], embedding),
    )
    # ``cutoff`` used to be accepted but ignored, so every call returned the
    # entire vocabulary. Slicing with ``None`` still yields the full ranking.
    return ranked_tokens[:cutoff]

In [16]:
# Word-analogy probe: apply the offset (twig - branch) to "hand" and print the
# five tokens closest to the resulting vector.
analogy_vector = embeddings_dict["twig"] - embeddings_dict["branch"] + embeddings_dict["hand"]
print(find_closest_embeddings(analogy_vector)[:5])

['flashlight', 'twig', 'clipboard', 'shove', 'hand']


In [46]:
# Three probe sentences for the summed-word-vector comparison: a and b both
# talk about k-nearest-neighbour models, c is unrelated prose.
sentence_a = "Here we perform 5-fold cross validation of a KNN model after using a standard scaler"
sentence_b = "In this kernel I present a very simple K-nearest neighbors model based on the quantiles of the distribution"
sentence_c = "And so it begins but you can't have them all. Your heart has to settle down somewhere"

In [47]:
# Start every sentence embedding as its own zero vector with the same shape
# and dtype as a stored word vector.
embedding_a, embedding_b, embedding_c = (
    np.zeros_like(embeddings_dict["branch"]) for _ in range(3)
)

In [48]:
def sum_word_vectors(sentence):
    """Sum the stored vectors of the whitespace-separated words in ``sentence``.

    Words that are not keys of ``embeddings_dict`` are skipped. The lookup is
    an exact, case-sensitive match on each whitespace-delimited chunk, so
    punctuation attached to a word is part of the key being looked up.

    Args:
        sentence: Text to embed.

    Returns:
        A new array shaped like a stored word vector holding the element-wise
        sum of the vectors that were found (all zeros if none were).
    """
    # NOTE(review): if the GloVe vocabulary is lowercase-only, capitalised
    # words such as "Here" or "KNN" are silently dropped here - confirm
    # whether the sentences should be lowercased first.
    total = np.zeros_like(embeddings_dict["branch"])
    for word in sentence.split():
        if word in embeddings_dict:
            total += embeddings_dict[word]
    return total


# Each embedding is rebuilt from a fresh zero vector, so re-running this cell
# gives the same result instead of adding the word vectors a second time.
embedding_a = sum_word_vectors(sentence_a)
embedding_b = sum_word_vectors(sentence_b)
embedding_c = sum_word_vectors(sentence_c)

In [50]:
# Print the five vocabulary tokens closest to the summed vector of sentence_a.
print(find_closest_embeddings(embedding_a)[:5])

['a', 'an', 'this', 'be', 'for']


In [54]:
# Euclidean distance between the summed vectors of sentence_a and sentence_b.
spatial.distance.euclidean(embedding_a, embedding_b)

23.454050064086914

In [52]:
# Euclidean distance between the summed vectors of sentence_a and sentence_c.
spatial.distance.euclidean(embedding_a, embedding_c)

32.27524185180664

In [53]:
# Euclidean distance between the summed vectors of sentence_b and sentence_c.
spatial.distance.euclidean(embedding_b, embedding_c)

30.673566818237305

In [None]:
# Scratch fragment copied from the sort key inside find_closest_embeddings.
# It carried an unbalanced ")" (a SyntaxError) and `embedding` is only defined
# as a parameter of that function, so it is kept as a comment for reference:
# spatial.distance.euclidean(embeddings_dict[token], embedding)

In [None]:
# Scratch fragment: the body of find_closest_embeddings pasted on its own.
# A `return` outside a function is a SyntaxError, so it is kept only as a
# comment for reference:
# return sorted(embeddings_dict.keys(), key=lambda token: spatial.distance.euclidean(embeddings_dict[token], embedding))

# BERT sentence embeddings (sentence-transformers)

In [1]:
! pip install -U sentence-transformers
# https://www.sbert.net/

Collecting sentence-transformers
  Using cached sentence-transformers-2.2.1.tar.gz (84 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting transformers<5.0.0,>=4.6.0
  Using cached transformers-4.20.1-py3-none-any.whl (4.4 MB)
Collecting torch>=1.6.0


In [4]:
! pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu

Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cpu
Collecting torch
  Using cached https://download.pytorch.org/whl/cpu/torch-1.11.0%2Bcpu-cp37-cp37m-linux_x86_64.whl (169.1 MB)
Collecting torchvision
  Using cached https://download.pytorch.org/whl/cpu/torchvision-0.12.0%2Bcpu-cp37-cp37m-linux_x86_64.whl (14.7 MB)
Collecting torchaudio
  Using cached https://download.pytorch.org/whl/cpu/torchaudio-0.11.0%2Bcpu-cp37-cp37m-linux_x86_64.whl (2.7 MB)
Installing collected packages: torch, torchvision, torchaudio
Successfully installed torch-1.11.0+cpu torchaudio-0.11.0+cpu torchvision-0.12.0+cpu
[0m

In [10]:
! pip install lexrank

Collecting lexrank
  Downloading lexrank-0.1.0-py3-none-any.whl (69 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.8/69.8 kB[0m [31m933.0 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting urlextract>=0.7
  Downloading urlextract-1.6.0-py3-none-any.whl (20 kB)
Collecting path.py>=10.5
  Downloading path.py-12.5.0-py3-none-any.whl (2.3 kB)
Collecting uritools
  Downloading uritools-4.0.0-py3-none-any.whl (10 kB)
Installing collected packages: uritools, urlextract, path.py, lexrank
Successfully installed lexrank-0.1.0 path.py-12.5.0 uritools-4.0.0 urlextract-1.6.0
[0mNote: you may need to restart the kernel to use updated packages.


In [2]:
import json
import inspect
from pathlib import Path
import pylev
import pandas as pd
from tqdm import tqdm
import numpy as np
from scipy import spatial
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sentence_transformers import SentenceTransformer, util
import torch

In [4]:
"""
LexRank implementation
Source: https://github.com/crabcamp/lexrank/tree/dev
"""

import numpy as np
from scipy.sparse.csgraph import connected_components
from scipy.special import softmax
import logging

logger = logging.getLogger(__name__)

def degree_centrality_scores(
    similarity_matrix,
    threshold=None,
    increase_power=True,
):
    """Score every node of a similarity graph by its stationary probability.

    The similarity matrix is turned into a Markov transition matrix and the
    unnormalized stationary distribution of that matrix is returned.

    Args:
        similarity_matrix: Square matrix of pairwise similarities.
        threshold: ``None`` to use the similarities as continuous weights, or
            a real number in ``[0, 1)``; similarities greater than or equal to
            it become edges of weight 1 and all others are dropped.
        increase_power: Forwarded to ``stationary_distribution``.

    Returns:
        The result of ``stationary_distribution(..., normalized=False)``.

    Raises:
        ValueError: If ``threshold`` is neither ``None`` nor a real number in
            ``[0, 1)``.
    """
    if threshold is not None:
        # Integers and NumPy scalars are accepted too: the previous
        # ``isinstance(threshold, float)`` check rejected a plain ``0`` even
        # though it lies inside the documented interval. ``bool`` is excluded
        # explicitly because it is a subclass of ``int``.
        is_real_number = (
            isinstance(threshold, (int, float, np.integer, np.floating))
            and not isinstance(threshold, bool)
        )
        if not (is_real_number and 0 <= threshold < 1):
            raise ValueError(
                '\'threshold\' should be a floating-point number '
                'from the interval [0, 1) or None',
            )

    if threshold is None:
        markov_matrix = create_markov_matrix(similarity_matrix)

    else:
        markov_matrix = create_markov_matrix_discrete(
            similarity_matrix,
            threshold,
        )

    scores = stationary_distribution(
        markov_matrix,
        increase_power=increase_power,
        normalized=False,
    )

    return scores


def _power_method(transition_matrix, increase_power=True, max_iter=10000):
    eigenvector = np.ones(len(transition_matrix))

    if len(eigenvector) == 1:
        return eigenvector

    transition = transition_matrix.transpose()

    for _ in range(max_iter):
        eigenvector_next = np.dot(transition, eigenvector)

        if np.allclose(eigenvector_next, eigenvector):
            return eigenvector_next

        eigenvector = eigenvector_next

        if increase_power:
            transition = np.dot(transition, transition)

    logger.warning("Maximum number of iterations for power method exceeded without convergence!")
    return eigenvector_next


def connected_nodes(matrix):
    """Group node indices by the connected component they belong to.

    Args:
        matrix: Graph matrix passed to
            ``scipy.sparse.csgraph.connected_components``.

    Returns:
        A list with one integer index array per component, ordered by
        component label.
    """
    _, component_of = connected_components(matrix)
    return [
        np.where(component_of == component)[0]
        for component in np.unique(component_of)
    ]


def create_markov_matrix(weights_matrix):
    """Turn a square weight matrix into a row-normalized transition matrix.

    Args:
        weights_matrix: Square 2-D array of edge weights.

    Returns:
        ``softmax(weights_matrix, axis=1)`` when the smallest weight is zero
        or negative, otherwise each row divided by its own sum.

    Raises:
        ValueError: If the matrix is not square.
    """
    n_rows, n_cols = weights_matrix.shape
    if n_rows != n_cols:
        raise ValueError('\'weights_matrix\' should be square')

    # Rows containing zero or negative weights are normalized with a softmax
    # rather than by dividing by the row sum.
    if np.min(weights_matrix) <= 0:
        return softmax(weights_matrix, axis=1)

    return weights_matrix / weights_matrix.sum(axis=1, keepdims=True)


def create_markov_matrix_discrete(weights_matrix, threshold):
    """Binarize the weights at ``threshold`` and build a Markov matrix from them.

    Args:
        weights_matrix: Array of edge weights.
        threshold: Weights greater than or equal to this value become 1;
            every other entry becomes 0.

    Returns:
        The result of ``create_markov_matrix`` applied to the 0/1 matrix.
    """
    adjacency = np.zeros(weights_matrix.shape)
    adjacency[weights_matrix >= threshold] = 1
    return create_markov_matrix(adjacency)


def stationary_distribution(
    transition_matrix,
    increase_power=True,
    normalized=True,
):
    """Run the power method separately on every connected component.

    Args:
        transition_matrix: Square 2-D transition matrix.
        increase_power: Forwarded to ``_power_method``.
        normalized: When true, every entry of the result is divided by the
            number of rows of ``transition_matrix``.

    Returns:
        A 1-D array with one value per node, filled component by component
        with the vector returned by ``_power_method`` for that component.

    Raises:
        ValueError: If the matrix is not square.
    """
    n_nodes, n_cols = transition_matrix.shape
    if n_nodes != n_cols:
        raise ValueError('\'transition_matrix\' should be square')

    distribution = np.zeros(n_nodes)

    for component in connected_nodes(transition_matrix):
        # Restrict the matrix to the rows and columns of this component.
        sub_matrix = transition_matrix[np.ix_(component, component)]
        distribution[component] = _power_method(sub_matrix, increase_power=increase_power)

    return distribution / n_nodes if normalized else distribution

In [7]:
import nltk
# nltk.download('punkt')
from sentence_transformers import SentenceTransformer, util
import numpy as np



# Load the 'all-MiniLM-L6-v2' SentenceTransformer model; it is used further
# down in this cell to encode the sentences of the document.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Our input document we want to summarize
# As example, we take the first section from Wikipedia
document = """
New York City (NYC), often called simply New York, is the most populous city in the United States. With an estimated 2019 population of 8,336,817 distributed over about 302.6 square miles (784 km2), New York City is also the most densely populated major city in the United States. Located at the southern tip of the U.S. state of New York, the city is the center of the New York metropolitan area, the largest metropolitan area in the world by urban landmass. With almost 20 million people in its metropolitan statistical area and approximately 23 million in its combined statistical area, it is one of the world's most populous megacities. New York City has been described as the cultural, financial, and media capital of the world, significantly influencing commerce, entertainment, research, technology, education, politics, tourism, art, fashion, and sports. Home to the headquarters of the United Nations, New York is an important center for international diplomacy.
Situated on one of the world's largest natural harbors, New York City is composed of five boroughs, each of which is a county of the State of New York. The five boroughs—Brooklyn, Queens, Manhattan, the Bronx, and Staten Island—were consolidated into a single city in 1898. The city and its metropolitan area constitute the premier gateway for legal immigration to the United States. As many as 800 languages are spoken in New York, making it the most linguistically diverse city in the world. New York is home to more than 3.2 million residents born outside the United States, the largest foreign-born population of any city in the world as of 2016. As of 2019, the New York metropolitan area is estimated to produce a gross metropolitan product (GMP) of $2.0 trillion. If the New York metropolitan area were a sovereign state, it would have the eighth-largest economy in the world. New York is home to the highest number of billionaires of any city in the world.
New York City traces its origins to a trading post founded by colonists from the Dutch Republic in 1624 on Lower Manhattan; the post was named New Amsterdam in 1626. The city and its surroundings came under English control in 1664 and were renamed New York after King Charles II of England granted the lands to his brother, the Duke of York. The city was regained by the Dutch in July 1673 and was subsequently renamed New Orange for one year and three months; the city has been continuously named New York since November 1674. New York City was the capital of the United States from 1785 until 1790, and has been the largest U.S. city since 1790. The Statue of Liberty greeted millions of immigrants as they came to the U.S. by ship in the late 19th and early 20th centuries, and is a symbol of the U.S. and its ideals of liberty and peace. In the 21st century, New York has emerged as a global node of creativity, entrepreneurship, and environmental sustainability, and as a symbol of freedom and cultural diversity. In 2019, New York was voted the greatest city in the world per a survey of over 30,000 people from 48 cities worldwide, citing its cultural diversity.
Many districts and landmarks in New York City are well known, including three of the world's ten most visited tourist attractions in 2013. A record 62.8 million tourists visited New York City in 2017. Times Square is the brightly illuminated hub of the Broadway Theater District, one of the world's busiest pedestrian intersections, and a major center of the world's entertainment industry. Many of the city's landmarks, skyscrapers, and parks are known around the world. Manhattan's real estate market is among the most expensive in the world. Providing continuous 24/7 service and contributing to the nickname The City that Never Sleeps, the New York City Subway is the largest single-operator rapid transit system worldwide, with 472 rail stations. The city has over 120 colleges and universities, including Columbia University, New York University, Rockefeller University, and the City University of New York system, which is the largest urban public university system in the United States. Anchored by Wall Street in the Financial District of Lower Manhattan, New York City has been called both the world's leading financial center and the most financially powerful city in the world, and is home to the world's two largest stock exchanges by total market capitalization, the New York Stock Exchange and NASDAQ.
"""

# Number of top-ranked sentences printed as the summary.
NUM_SUMMARY_SENTENCES = 5

# Split the document into sentences.
# NOTE(review): presumably needs the NLTK 'punkt' data from the commented-out
# nltk.download call at the top of this cell - confirm on a fresh environment.
sentences = nltk.sent_tokenize(document)
print("Num sentences:", len(sentences))

# Compute one embedding per sentence.
embeddings = model.encode(sentences, convert_to_tensor=True)

# Pairwise cosine similarities as a NumPy matrix. `.cpu()` leaves CPU tensors
# untouched and lets the conversion succeed when the model runs on another device.
cos_scores = util.cos_sim(embeddings, embeddings).cpu().numpy()

# Centrality score of every sentence in the similarity graph.
centrality_scores = degree_centrality_scores(cos_scores, threshold=None)

# Negating before argsort puts the highest-scoring sentence first.
most_central_sentence_indices = np.argsort(-centrality_scores)


# Print the sentences with the highest scores.
print("\n\nSummary:")
for idx in most_central_sentence_indices[0:NUM_SUMMARY_SENTENCES]:
    print(sentences[idx].strip())

Num sentences: 29


Summary:
New York City (NYC), often called simply New York, is the most populous city in the United States.
In 2019, New York was voted the greatest city in the world per a survey of over 30,000 people from 48 cities worldwide, citing its cultural diversity.
Located at the southern tip of the U.S. state of New York, the city is the center of the New York metropolitan area, the largest metropolitan area in the world by urban landmass.
With an estimated 2019 population of 8,336,817 distributed over about 302.6 square miles (784 km2), New York City is also the most densely populated major city in the United States.
New York City was the capital of the United States from 1785 until 1790, and has been the largest U.S. city since 1790.


In [9]:
def read_train_data(data_dir, NUM_TRAIN = 10000):
    """Load up to ``NUM_TRAIN`` notebook JSON files from ``data_dir / 'train'``.

    Args:
        data_dir: Directory (``str`` or ``Path``) containing a ``train``
            sub-directory of ``*.json`` notebook files.
        NUM_TRAIN: Maximum number of notebooks to read; ``None`` reads all.

    Returns:
        One DataFrame holding the cells of every notebook read, indexed by
        ``(id, cell_id)`` and sorted by ``id``, where ``id`` is the file name
        without its extension.

    Raises:
        ValueError: If no notebook files are selected.
    """
    def read_notebook(path):
        """Read one notebook file and tag its rows with the notebook id."""
        return (
            pd.read_json(
                path,
                dtype={'cell_type': 'category', 'source': 'str'})
            .assign(id=path.stem)  # final path component without the suffix
            .rename_axis('cell_id')
        )

    train_dir = Path(data_dir) / 'train'
    # glob() yields files in a filesystem-dependent order; sorting makes the
    # NUM_TRAIN subset the same on every run and every machine.
    paths_train = sorted(train_dir.glob('*.json'))[:NUM_TRAIN]
    if not paths_train:
        # pd.concat would raise an opaque "No objects to concatenate" here.
        raise ValueError(f"No notebook JSON files found in {train_dir}")

    notebooks_train = [
      read_notebook(path) for path in tqdm(paths_train, desc='Train NBs')
    ]
    df = (
      pd.concat(notebooks_train)
      .set_index('id', append=True)
      .swaplevel()
      .sort_index(level='id', sort_remaining=False)
    )
    return df

In [11]:
# Root data directory, relative to the notebook's working directory.
data_dir = Path('data/')

In [12]:
# Load the training notebooks into one frame (about 2.5 minutes for 10,000
# files according to the recorded progress bar).
df = read_train_data(data_dir, NUM_TRAIN=10000)

Train NBs: 100%|██████████| 10000/10000 [02:35<00:00, 64.31it/s]


In [None]:
# The cell ended on a dangling "." (a SyntaxError). Show the first rows of the
# flattened frame instead of dumping every row.
df.reset_index().head()

In [4]:
pip install texthero

Collecting texthero
  Using cached texthero-1.1.0-py3-none-any.whl (24 kB)
Collecting spacy<3.0.0
  Using cached spacy-2.3.7-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (10.4 MB)
Collecting pandas>=1.0.2
  Using cached pandas-1.3.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.3 MB)
Collecting wordcloud>=1.5.0
  Downloading wordcloud-1.8.2.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (435 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m435.2/435.2 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m00:01[0m
Collecting gensim<4.0,>=3.6.0
  Using cached gensim-3.8.3-cp37-cp37m-manylinux1_x86_64.whl (24.2 MB)
Collecting unidecode>=1.1.1
  Using cached Unidecode-1.3.4-py3-none-any.whl (235 kB)
Collecting smart-open>=1.8.1
  Using cached smart_open-6.0.0-py3-none-any.whl (58 kB)
Collecting preshed<3.1.0,>=3.0.2
  Using cached preshed-3.0.6-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.wh

In [16]:
import plotly

In [17]:
import texthero as hero
import pandas as pd


# Load the BBC Sport dataset shipped with the texthero repository.
# NOTE(review): this rebinds `df`, which earlier in the notebook held the
# training-notebook frame - consider a distinct name.
BBC_SPORT_CSV_URL = "https://github.com/jbesomi/texthero/raw/master/dataset/bbcsport.csv"
df = pd.read_csv(BBC_SPORT_CSV_URL)

# Clean the raw text, then compute the TF-IDF representation.
cleaned_text = hero.clean(df['text'])
df['tfidf'] = hero.tfidf(cleaned_text)

# Cluster the TF-IDF vectors into five groups; labels are stored as strings.
cluster_labels = hero.kmeans(df['tfidf'], n_clusters=5)
df['kmeans_labels'] = cluster_labels.astype(str)

# PCA projection of the TF-IDF vectors.
df['pca'] = hero.pca(df['tfidf'])

In [40]:
# Scatter plot of the 'pca' column coloured by k-means label; with
# return_figure=True the result is bound to `f`.
f = hero.scatterplot(df, 'pca', color='kmeans_labels', title="K-means BBC Sport news", return_figure=True)

In [4]:
text="### How many classes do each image have? .## images with defect(contain 3 type label) .* There are similar numbers of images with and without defects.\n* class is imbalanced .# Let's visualization masks! .## images with defect(label: 4) .### Start Classification Process .## check image data\n### image size .## images with defect(label: 1) .## images with defect(label: 2) .#### Augmentation functions in one place . Will use later . Not now .#### Images with normal mask and Ben's processing .## images with defect(contain multi label) .#### Multiple Images with Ben's preprocessing . Reference :http://faculty.neu.edu.cn/yunhyan/NEU_surface_defect_database.html\n \n In the Northeastern University (NEU) surface defect database, six kinds of typical surface defects of the hot-rolled steel strip are collected, i.e., rolled-in scale (RS), patches (Pa), crazing (Cr), pitted surface (PS), inclusion (In) and scratches (Sc). \n #### At a first look it seems for our images : \n1. Class 1 : Inclusion\n2. Class 2: Pitted\n3. Class 3 : Scratches \n4. Class 4 : Patches . \n\n\n\nHowever I might be wrong :) . \n![image.png](attachment:image.png)\n\n\n .## Note : This Kernel is a Fork from the amazing Kernel below . So please upvote the original Kernel . I have started adding few information and preprocessing into this on my own . \nhttps://www.kaggle.com/go1dfish/clear-mask-visualization-and-simple-eda .## images with defect(label: 3) .* All image have same shape, (1600, 256). .#### Start with binary classification here .## About The Competition : Detecting Steel Defect \n\nSteel is one of the most important building materials of modern times. Steel buildings are resistant to natural and man-made wear which has made the material ubiquitous around the world. To help make production of steel more efficient, this competition will help identify defects.\n\n\nSeverstal is leading the charge in efficient steel mining and production. 
They believe the future of metallurgy requires development across the economic, ecological, and social aspects of the industry—and they take corporate responsibility seriously. The company recently created the country’s largest industrial data lake, with petabytes of data that were previously discarded. Severstal is now looking to machine learning to improve automation, increase efficiency, and maintain high quality in their production.\n\nThe production process of flat sheet steel is especially delicate. From heating and rolling, to drying and cutting, several machines touch flat steel by the time it’s ready to ship. Today, Severstal uses images from high frequency cameras to power a defect detection algorithm.\n\nIn this competition, you’ll help engineers improve the algorithm by localizing and classifying surface defects on a steel sheet.\n\nIf successful, you’ll help keep manufacturing standards for steel high and enable Severstal to continue their innovation, leading to a stronger, more efficient world all around us. .## import modules and define models .* almost image have no defect or one kind of defect"

In [1]:
import pke