In [11]:
import pandas as pd
import spacy
import numpy as np
from collections import defaultdict
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score


In [12]:
# Select the compute device: CUDA when a GPU is usable, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [13]:
# --- 1. Statements for each TRL stage ---
# Maps a human-readable stage label to a list of English reference sentences
# for that stage. The same sentences are used both for literal text matching
# (analyze_technology_text) and as extra labelled rows for the classifier.
TRL_STATEMENTS = {
    'Research (TRL 1-3)': [
        "Initial theoretical concepts have been proposed.",
        "The technology is currently under academic investigation.",
        "Only laboratory-scale results are available so far.",
        "No practical implementation has been achieved.",
        "The feasibility is being evaluated in simulated environments.",
        "The technology is still a scientific hypothesis.",
        "There is no commercial application at this stage.",
        "Experiments are being conducted under controlled conditions.",
        "The concept requires further validation.",
        "The technology has not progressed beyond basic research."
    ],
    'Prototyping (TRL 4-6)': [
        "A prototype has been developed and tested in limited settings.",
        "Initial field testing has been conducted successfully.",
        "Demonstration systems have been built for evaluation.",
        "The system operates in a controlled operational environment.",
        "Pilot studies are currently ongoing.",
        "The solution has been demonstrated at technology expos.",
        "Small-scale deployment is being tested.",
        "The prototype is functional but not optimized.",
        "First trials have begun in industry-specific scenarios.",
        "The technology is moving from lab to real-world conditions."
    ],
    'Deployment (TRL 7-8)': [
        "The system is being integrated into business workflows.",
        "Real-world validation has shown reliable performance.",
        "The solution is used in early-stage commercial operations.",
        "The technology is being adopted in limited production.",
        "Customers have begun to apply the solution in practice.",
        "Regulatory approvals have been obtained for testing.",
        "Several industry partners are involved in the rollout.",
        "Commercial pilot programs are underway.",
        "The solution is being procured by selected clients.",
        "Key performance metrics are being monitored in production."
    ],
    'Widespread Adoption (TRL 9)': [
        "The technology is widely adopted in industry.",
        "It is part of standardized processes across sectors.",
        "Products using this technology are available on the market.",
        "The solution is mass-produced and globally distributed.",
        "Certifications and compliance standards are in place.",
        "Multiple vendors now offer the same type of solution.",
        "The market has reached maturity for this technology.",
        "Training programs include this technology as a core module.",
        "Integration with existing IT ecosystems is seamless.",
        "Governments and corporations rely on it at scale."
    ],
    # Post-maturity stage; it has no TRL number of its own.
    'Decline': [
        "Investment in the technology is decreasing.",
        "Interest has shifted toward newer alternatives.",
        "Market share is shrinking year over year.",
        "The product line has been discontinued.",
        "The technology is considered obsolete in most applications.",
        "Support and maintenance are being phased out.",
        "Only legacy systems still use the technology.",
        "It is mentioned mostly in retrospective analyses.",
        "Academic interest has declined significantly.",
        "Companies are migrating to next-generation solutions."
    ]
}

# --- 2. Fuzzy-probabilistic approach based on the statements ---
# spaCy's small English pipeline, loaded once at module level.
nlp = spacy.load("en_core_web_sm")


def analyze_technology_text(text: str, technology_keyword: str):
    """Estimate the maturity stage of a technology from literal statement matches.

    Every sentence in ``TRL_STATEMENTS`` is searched for in ``text`` as a
    case-insensitive substring. Each hit adds 1.0 to the score of the stage
    the sentence belongs to; the scores are then normalised to sum to 1.

    Args:
        text: Document to analyse.
        technology_keyword: Name of the technology. It is only lower-cased and
            echoed back in the result; it does not influence the matching.

    Returns:
        ``{"error": "No relevant statements found"}`` when nothing matched,
        otherwise a dict with the keys ``technology_keyword``,
        ``estimated_stage`` (label with the highest share),
        ``stage_scores`` (label -> share of matches) and
        ``matched_statements`` (list of ``(statement, stage)`` tuples).
    """
    # Short labels used as keys of the returned ``stage_scores`` mapping.
    stage_labels = {
        'Research (TRL 1-3)': 'TRL 1-3',
        'Prototyping (TRL 4-6)': 'TRL 4-6',
        'Deployment (TRL 7-8)': 'TRL 7-8',
        'Widespread Adoption (TRL 9)': 'TRL 9',
        'Decline': 'Decline',
    }

    # Lower-case once instead of once per statement. The previous spaCy parse
    # of the text was never read, so it is not performed any more.
    haystack = text.lower()
    technology_keyword = technology_keyword.lower()

    trl_scores = defaultdict(float)
    matched_statements = []

    for stage, statements in TRL_STATEMENTS.items():
        label = stage_labels.get(stage)
        for statement in statements:
            if statement.lower() not in haystack:
                continue
            # A stage without a known label is still reported as matched,
            # it just does not contribute to the scores.
            if label is not None:
                trl_scores[label] += 1.0
            matched_statements.append((statement, stage))

    total_score = sum(trl_scores.values())
    if total_score == 0:
        return {"error": "No relevant statements found"}

    normalized_scores = {k: v / total_score for k, v in trl_scores.items()}
    # total_score > 0 guarantees at least one entry, so max() is safe here.
    dominant_stage = max(normalized_scores, key=normalized_scores.get)

    return {
        "technology_keyword": technology_keyword,
        "estimated_stage": dominant_stage,
        "stage_scores": normalized_scores,
        "matched_statements": matched_statements
    }


# --- 3. Transformer model based on SciBERT ---
# Load the data (path is relative to the notebook's working directory)
df = pd.read_excel('table.xlsx')


# Preprocessing: convert the TRL description into a numeric level
def extract_trl(level):
    """Extract a numeric TRL from a free-text maturity description.

    The first number that follows the token ``TRL`` (optionally separated by
    whitespace and one of ``: = # -``) is returned, so ``"TRL 7"``,
    ``"TRL7"``, ``"TRL: 5"`` and ``"TRL 4."`` are all understood. When no such
    number is present the text is checked for the phrases
    ``fully commercialized`` (-> 9) and ``demonstration`` (-> 6).

    Args:
        level: Cell value to parse; non-string values (NaN, None, numbers)
            are converted with ``str()`` first.

    Returns:
        The TRL as ``int``, or ``None`` when nothing could be derived.
    """
    import re  # local import keeps the block self-contained

    text = level if isinstance(level, str) else str(level)

    # Targeted pattern instead of split()/int() under a bare ``except``:
    # trailing punctuation no longer turns a valid level into None.
    match = re.search(r'TRL\s*[:=#-]?\s*(\d+)', text)
    if match:
        return int(match.group(1))

    # Fall through to the keyword heuristics even if "TRL" was mentioned
    # without a usable number.
    lowered = text.lower()
    if 'fully commercialized' in lowered:
        return 9
    if 'demonstration' in lowered:
        return 6
    return None


# Derive a numeric TRL from the technology-development-level column.
df['TRL'] = df['Уровень развития технологии'].apply(extract_trl)
# Discard rows for which extract_trl could not determine a level.
df = df.dropna(subset=['TRL'])


# Map a TRL to classes 0-3; every other value is treated as Decline (4)
def trl_to_class(trl):
    """Return the stage class for a numeric TRL.

    Levels 1-3 -> 0, 4-6 -> 1, 7-8 -> 2, 9 -> 3. Any value outside 1-9
    (including None/NaN) yields 4, the Decline class.
    """
    buckets = (
        ((1, 2, 3), 0),
        ((4, 5, 6), 1),
        ((7, 8), 2),
        ((9,), 3),
    )
    for levels, stage_class in buckets:
        if trl in levels:
            return stage_class
    return 4  # Decline


# Label every table row with its stage class.
df['class'] = df['TRL'].apply(trl_to_class)

# Add the reference statements to the training set
# Class index per stage label; labels not listed here fall back to 4 (Decline).
stage_to_class = {
    'Research (TRL 1-3)': 0,
    'Prototyping (TRL 4-6)': 1,
    'Deployment (TRL 7-8)': 2,
    'Widespread Adoption (TRL 9)': 3,
}

statement_data = []
for stage, statements in TRL_STATEMENTS.items():
    trl_class = stage_to_class.get(stage, 4)
    for statement in statements:
        row = {'Предложение': statement, 'class': trl_class}
        statement_data.append(row)

statement_df = pd.DataFrame(statement_data)
# Keep only the text and label columns of the table, then append the statements.
df = pd.concat(
    [df[['Предложение', 'class']], statement_df],
    ignore_index=True,
)

# Split the data into train and test parts (80/20, fixed seed)
# NOTE(review): df already contains the TRL_STATEMENTS rows at this point, so
# the hold-out part can include those synthetic sentences; the split is also
# not stratified by class. Confirm both are intended.
X = df['Предложение'].tolist()
y = df['class'].tolist()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Load SciBERT (tokenizer and a sequence-classification model)
tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')
# The classification head is newly initialised (see the warning in the cell
# output), so the model has to be trained before its predictions are useful.
model = AutoModelForSequenceClassification.from_pretrained('allenai/scibert_scivocab_uncased',
                                                           num_labels=5)  # 5 classes


# Data preparation
def tokenize_function(examples):
    """Tokenize ``examples`` with the module-level ``tokenizer``.

    Each sequence is padded or truncated to 128 tokens and the encoding is
    returned as PyTorch tensors.
    """
    encoding_options = {
        'padding': 'max_length',
        'truncation': True,
        'max_length': 128,
        'return_tensors': 'pt',
    }
    return tokenizer(examples, **encoding_options)


# Tokenize each split once up front; mini-batches are sliced from these
# tensors in the training and prediction loops.
X_train_tokenized = tokenize_function(X_train)
X_test_tokenized = tokenize_function(X_test)

# Model training
# Use the GPU when available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

# 3 epochs over the training set in consecutive mini-batches of 16.
# NOTE(review): the batches are visited in the same order every epoch (no
# reshuffling) and no torch seed is set, so runs are not reproducible.
for epoch in range(3):
    model.train()
    for i in range(0, len(X_train), 16):
        # Slice every tokenizer output tensor to the current batch and move it to the device.
        batch = {k: X_train_tokenized[k][i:i + 16].to(device) for k in X_train_tokenized}
        labels = torch.tensor(y_train[i:i + 16], dtype=torch.long).to(device)
        # Passing labels makes the model return the loss alongside the logits.
        outputs = model(**batch, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

# Prediction
model.eval()
predictions = []
with torch.no_grad():  # inference only, no gradients needed
    for start in range(0, len(X_test), 16):
        stop = start + 16
        # Slice each tokenizer output to the current batch and move it to the device.
        batch = {
            name: X_test_tokenized[name][start:stop].to(device)
            for name in X_test_tokenized
        }
        logits = model(**batch).logits
        # The predicted class is the index of the largest logit in each row.
        predictions.extend(logits.argmax(dim=1).cpu().numpy())

# Evaluate on the hold-out part: plain accuracy and class-frequency-weighted F1.
accuracy = accuracy_score(y_test, predictions)
f1 = f1_score(y_test, predictions, average='weighted')
print(f'Точность модели: {accuracy:.3f}')
print(f'F1-Score: {f1:.3f}')
# NOTE(review): this repeats the accuracy line printed above with str.format.
print("Точность модели: {:.3f}".format(accuracy))
print("Programm is finished")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at allenai/scibert_scivocab_uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Точность модели: 0.357
F1-Score: 0.210
Точность модели: 0.357
Programm is finished


In [14]:
# Re-print the metrics; relies on `accuracy` and `f1` from the previous cell
# still being defined in the kernel.
print(f'Точность модели: {accuracy:.3f}')
print(f'F1-Score: {f1:.3f}')
print("Точность модели: {:.3f}".format(accuracy))
print("Programm is finished")

Точность модели: 0.357
F1-Score: 0.210
Точность модели: 0.357
Programm is finished
