In [1]:
import pandas as pd, re, nltk, string
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [2]:
df = pd.read_csv('G:/Extractive-Summarisation-of-German-Wikipedia/dataset/data_train.csv', encoding='utf-8')

In [3]:
stopword_list = nltk.corpus.stopwords.words('german')

def remove_stopwords(text):
    filtered_words = [word for word in nltk.word_tokenize(text) if word not in stopword_list]
    filtered_text = ' '.join(filtered_words)
    return filtered_text

df["source"] = df["source"].apply(remove_stopwords)

In [4]:
from nltk.corpus import stopwords
german_stop_words = stopwords.words('german')

cv = CountVectorizer(max_df=0.85,stop_words=german_stop_words, max_features=10000)
word_count_vector = cv.fit_transform(df['source'])


In [5]:
tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True)
tfidf_transformer.fit(word_count_vector)

TfidfTransformer()

In [6]:
tfidf_transformer.idf_

array([5.49006738, 2.90718032, 5.39087558, ..., 6.64081768, 6.76036283,
       6.99548257])

In [7]:
feature_names=cv.get_feature_names()

In [8]:
def sort_coo(coo_matrix):
    tuples = zip(coo_matrix.col, coo_matrix.data)
    return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)

def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """get the feature names and tf-idf score of top n items"""
    
    #use only topn items from vector
    sorted_items = sorted_items[:topn]

    score_vals = []
    feature_vals = []

    for idx, score in sorted_items:
        fname = feature_names[idx]
        
        #keep track of feature name and its corresponding score
        score_vals.append(round(score, 3))
        feature_vals.append(feature_names[idx])

    #create a tuples of feature,score
    #results = zip(feature_vals,score_vals)
    results= {}
    for idx in range(len(feature_vals)):
        results[feature_vals[idx]]=score_vals[idx]
    
    return results

In [9]:
#doc = df['source'][0]
def keyw(df):
    kljuc = []
    tf_idf_vector=tfidf_transformer.transform(cv.transform([df]))
    sorted_items=sort_coo(tf_idf_vector.tocoo())
    keywords = extract_topn_from_vector(feature_names,sorted_items,10)
    for k in keywords:
        kljuc.append(k)
    return kljuc

df['keywords'] = df['source'].apply(keyw)

In [None]:
print(df['source'][56])

In [10]:
def keywords_inSent(df):
    text = []
    sentance= sent_tokenize(df, language="german")
    for s in sentance:
        words = word_tokenize(s, language="german")
        print(words)
        text.append(len(set.intersection(set(words), set(df['keywords']))))
    return text

print(keywords_inSent(df['source'][0]))

['Minghella', 'Sohn', 'italienisch-schottischer', 'Eltern', ',', 'Isle', 'of', 'Wight', 'Fabrik', 'für', 'Eiscreme', 'betrieben', '.']


TypeError: string indices must be integers

In [13]:
#doc = df['source'][2568]
def get_kew(df):
    kw=[]
    tf_idf_vector=tfidf_transformer.transform(cv.transform([df]))
    sorted_items=sort_coo(tf_idf_vector.tocoo())
    keywords = extract_topn_from_vector(feature_names,sorted_items,10)
    for k in keywords:
        kw.append(k)
    return kw

In [11]:
def get_keywords(idx):

    #generate tf-idf for the given document
    tf_idf_vector=tfidf_transformer.transform(cv.transform([df['source'][idx]]))

    #sort the tf-idf vectors by descending order of scores
    sorted_items=sort_coo(tf_idf_vector.tocoo())

    #extract only the top n; n here is 10
    keywords=extract_topn_from_vector(feature_names,sorted_items,10)
    
    return keywords

In [14]:
df['keywords'] = df['source'].apply(get_kew)
df.head()

Unnamed: 0,source,summary,keywords
0,Minghella Sohn italienisch-schottischer Eltern...,"Anthony Minghella, CBE war ein britischer Film...","[oscar, regie, opera, bbc, film, bestes, ferns..."
1,Ende 1940er Jahre wurde erste Auteur-Theorie f...,Die Auteur-Theorie ist eine Filmtheorie und di...,"[theorie, film, filme, autor, regisseur, ansat..."
2,"Al Pacino , geboren Manhattan , Sohn Salvatore...","Alfredo James ""Al"" Pacino ist ein US-amerikani...","[theatre, al, new, the, theaterstu, richard, y..."
3,Der Name Alkalimetalle leitet arabischen Wort ...,Als Alkalimetalle werden die chemischen Elemen...,"[sungen, wasser, eigenschaften, reaktion, lo, ..."
4,Die Arbeit bereits seit Altertum Gegenstand re...,Das deutsche Arbeitsrecht ist ein Rechtsgebiet...,"[geregelt, ra, gewerkschaften, bgb, betrieben,..."


In [None]:
idx=120
keywords=get_keywords(idx)
print(keywords)

In [None]:
from nltk.tokenize import sent_tokenize, word_tokenize
def no_keywords(df):
    sent = sent_tokenize(df, language="german")
    for s in sent:
        words

df[['word_overlap', 'overlap_count']] = df.apply(no_keywords, axis=1, raw=True).apply(pd.Series)

In [None]:
print(len(sent_tokenize(df['source'][0])))

In [None]:
text = []
sentance= sent_tokenize(df['source'][0], language="german")
for s in sentance:
    words = word_tokenize(s, language="german")
    print([sum(set.intersection(set(w), set(df['keywords'][0]))) for w in words])
    text.append([sum(set.intersection(set(w), set(df['keywords'][0]))) for w in words])
print(len(text))

In [15]:
def keywords_inSent(df, keywords):
    text=[]
    sentance= sent_tokenize(df, language="german")
    for s in sentance:
        s = s.translate(str.maketrans('', '', string.punctuation))
        text.append(list(len(set(s.split()).intersection(set(keywords)))))
    return text
df["keyw_in_sent"] = df.apply(lambda x: keywords_inSent(x["source"], x['keywords']), axis=1)

TypeError: 'int' object is not iterable

In [16]:
def capital_lWords(df):
    tokens = sent_tokenize(df, language="german")
    return [sum([c.isupper() for c in a]) for a in tokens]

df['noCap_LetterWords_inSentence'] = df['source'].apply(capital_lWords)

In [None]:
print(df['keywords'][0])

In [17]:
df.head()

Unnamed: 0,source,summary,keywords,noCap_LetterWords_inSentence
0,Minghella Sohn italienisch-schottischer Eltern...,"Anthony Minghella, CBE war ein britischer Film...","[oscar, regie, opera, bbc, film, bestes, ferns...","[7, 6, 1, 4, 9, 9, 10, 8, 17, 37, 11, 7, 12, 7..."
1,Ende 1940er Jahre wurde erste Auteur-Theorie f...,Die Auteur-Theorie ist eine Filmtheorie und di...,"[theorie, film, filme, autor, regisseur, ansat...","[10, 7, 7, 6, 4, 3, 5, 5, 8, 2, 3, 8, 13, 6, 6..."
2,"Al Pacino , geboren Manhattan , Sohn Salvatore...","Alfredo James ""Al"" Pacino ist ein US-amerikani...","[theatre, al, new, the, theaterstu, richard, y...","[15, 3, 13, 8, 7, 2, 3, 17, 8, 4, 11, 3, 10, 1..."
3,Der Name Alkalimetalle leitet arabischen Wort ...,Als Alkalimetalle werden die chemischen Elemen...,"[sungen, wasser, eigenschaften, reaktion, lo, ...","[8, 7, 2, 3, 5, 2, 2, 2, 6, 3, 4, 2, 5, 3, 5, ..."
4,Die Arbeit bereits seit Altertum Gegenstand re...,Das deutsche Arbeitsrecht ist ein Rechtsgebiet...,"[geregelt, ra, gewerkschaften, bgb, betrieben,...","[5, 5, 5, 8, 3, 3, 6, 8, 6, 7, 8, 6, 4, 8, 5, ..."


In [None]:
def f(columns):
    f_desc, f_def = columns[0], columns[1]
    common = set(f_desc.split()).intersection(set(f_def.split()))
    return len(common)

In [None]:
df['overlap_count'] = df.apply(f, axis=1, raw=True).apply(pd.Series)

In [None]:
df.head()

In [None]:
import string
text2 = df['summary'][0]
text2 = text2.translate(str.maketrans('', '', string.punctuation))
tokens = sent_tokenize(df['source'][0], language = "german")
tok2 = word_tokenize(text2, language = "german")
print(len(set(word_tokenize(tokens[0], language="german")).intersection(set(tok2))))

In [None]:
print(tok2)

In [None]:
def W_sourceSummary(df, df2):
    f = []
    text2 = df2
    text2 = text2.translate(str.maketrans('', '', string.punctuation))
    tokens = sent_tokenize(df, language = "german")
    tok2 = word_tokenize(text2, language = "german")
    for s in tokens:
        f.append(len((set(s.split()).intersection(set(tok2)))))
    return f

In [None]:
def W_sourceKeywords(df, df2):
    f = []
    df.lower()
    text2 = df2
    text2 = text2.translate(str.maketrans('', '', string.punctuation))
    tokens = sent_tokenize(df, language = "german")
    tok2 = word_tokenize(text2, language = "german")
    for s in tokens:
        f.append(len((set(s.split()).intersection(set(tok2)))))
    return f

def listToString(df):
    string = ' '.join([str(e) for e in df])
    return string

In [None]:
df['keywords'] = df['keywords'].apply(listToString)
df['no_words_inSent_SK'] = df.apply(lambda x: W_sourceKeywords(x['source'], x['keywords']), axis=1)
df.head()

In [None]:
#df['keywords'] = df['keywords'].apply(listToString)
print(df['no_words_inSent_SK'][0], "\n\n")

In [None]:
df = df.drop(columns='keywords2')

In [None]:
df['no_words_inSent_SS'] = df.apply(lambda x: W_sourceSummary(x['source'], x['summary']), axis=1)
df.head()

In [None]:
df.to_csv('G:/Extractive-Summarisation-of-German-Wikipedia/dataset/data_train.csv', encoding='utf-8', index=False)

In [None]:
tok1 = sent_tokenize(df['source'][0], language="german")
print(tok2, "\n", "\n", tok1[0])